===================================================================================================================

Automatic Ticket Assignment System

====================================================================================================================

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
Mounted at /content/drive

**Data Loading...**

In [2]:
# colors to format output
colors = {'PURPLE' : '\033[95m','CYAN' : '\033[96m','DARKCYAN' : '\033[36m','BLUE' : '\033[94m','GREEN' : '\033[92m',
     'YELLOW' : '\033[93m','RED' : '\033[91m','BOLD' : '\033[1m','UNDERLINE' : '\033[4m','END' : '\033[0m'}
In [3]:
from sklearn import preprocessing
import os 
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

class IncidentDataloader(object):
  def __init__(self, project_path):
    self.project_path = project_path
    
  def loadIncidents(self):
    ## Read the data from EXCEL
    return pd.read_excel(self.project_path)

  def getUniqueAssignmentGroups(self, df):
    return df['Assignment group'].unique()

  def getNullRows(self, df):
    return df[df.isnull().any(axis=1)]

  def getNullRowCount(self, df):
    return df.isnull().sum().sum()

  def processNullValues(self, df):
    return df.replace(np.nan, '', regex=True)

  def getDuplicateRows(self, df):
    return df[df.duplicated(keep="last")]

  def removeDuplicateRows(self, df):
      return df.drop_duplicates(keep="last")

  def combineDescription(self, df):
    return pd.DataFrame({"Short description":df['Short description'],
                         "Description":df['Description'],
                        "New_Description":np.where(df['Short description']==df['Description'],
                                                df['Short description'],
                                                df['Short description'] + " " + df['Description']),
                         "Caller": df["Caller"],
                         "Assignment group": df["Assignment group"]},
                         columns=["Short description","Description","New_Description","Caller","Assignment group"])
    
  def encodeAssignmentGroup(self, df):
    label_encoder = preprocessing.LabelEncoder() 
    df['Assignment group'] = label_encoder.fit_transform(df['Assignment group'])
    return df

  def applyPOSTagging(self, ds):
    ds['Description_pos_tagged'] = ds['New_Description'].apply(lambda x: self._word_pos_tagger(nltk.word_tokenize(x)))
    return ds
    
  def _word_pos_tagger(self, description_text):   
    pos_tagged_text = nltk.pos_tag(description_text)
    return pos_tagged_text
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
In [4]:
project_path = '/content/drive/My Drive/Colab Notebooks/Capstone/'
inc_dataLoader = IncidentDataloader(project_path+ "input_data.xlsx")
incidentsData = inc_dataLoader.loadIncidents()
In [5]:
incidentsData.shape
Out[5]:
(8500, 4)
In [6]:
inc_dataLoader.getUniqueAssignmentGroups(incidentsData)
Out[6]:
array(['GRP_0', 'GRP_1', 'GRP_3', 'GRP_4', 'GRP_5', 'GRP_6', 'GRP_7',
       'GRP_8', 'GRP_9', 'GRP_10', 'GRP_11', 'GRP_12', 'GRP_13', 'GRP_14',
       'GRP_15', 'GRP_16', 'GRP_17', 'GRP_18', 'GRP_19', 'GRP_2',
       'GRP_20', 'GRP_21', 'GRP_22', 'GRP_23', 'GRP_24', 'GRP_25',
       'GRP_26', 'GRP_27', 'GRP_28', 'GRP_29', 'GRP_30', 'GRP_31',
       'GRP_33', 'GRP_34', 'GRP_35', 'GRP_36', 'GRP_37', 'GRP_38',
       'GRP_39', 'GRP_40', 'GRP_41', 'GRP_42', 'GRP_43', 'GRP_44',
       'GRP_45', 'GRP_46', 'GRP_47', 'GRP_48', 'GRP_49', 'GRP_50',
       'GRP_51', 'GRP_52', 'GRP_53', 'GRP_54', 'GRP_55', 'GRP_56',
       'GRP_57', 'GRP_58', 'GRP_59', 'GRP_60', 'GRP_61', 'GRP_32',
       'GRP_62', 'GRP_63', 'GRP_64', 'GRP_65', 'GRP_66', 'GRP_67',
       'GRP_68', 'GRP_69', 'GRP_70', 'GRP_71', 'GRP_72', 'GRP_73'],
      dtype=object)
In [7]:
#Check null values in data
inc_dataLoader.getNullRows(incidentsData)
Out[7]:
Short description Description Caller Assignment group
2604 NaN \r\n\r\nreceived from: ohdrnswl.rezuibdt@gmail... ohdrnswl rezuibdt GRP_34
3383 NaN \r\n-connected to the user system using teamvi... qftpazns fxpnytmk GRP_0
3906 NaN -user unable tologin to vpn.\r\n-connected to... awpcmsey ctdiuqwe GRP_0
3910 NaN -user unable tologin to vpn.\r\n-connected to... rhwsmefo tvphyura GRP_0
3915 NaN -user unable tologin to vpn.\r\n-connected to... hxripljo efzounig GRP_0
3921 NaN -user unable tologin to vpn.\r\n-connected to... cziadygo veiosxby GRP_0
3924 NaN name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:mic... wvqgbdhm fwchqjor GRP_0
4341 NaN \r\n\r\nreceived from: eqmuniov.ehxkcbgj@gmail... eqmuniov ehxkcbgj GRP_0
4395 i am locked out of skype NaN viyglzfo ajtfzpkb GRP_0
In [8]:
print("Number of Null values in dataset: ",inc_dataLoader.getNullRowCount(incidentsData))
Number of Null values in dataset:  9
In [9]:
incidentsData.isnull().sum()
Out[9]:
Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64
In [10]:
incidentsData = inc_dataLoader.processNullValues(incidentsData)
In [11]:
inc_dataLoader.getDuplicateRows(incidentsData)
Out[11]:
Short description Description Caller Assignment group
39 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
51 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
126 blank call //gso blank call //gso rbozivdq gmlhrtvp GRP_0
229 call for ecwtrjnq jpecxuty call for ecwtrjnq jpecxuty olckhmvx pcqobjnd GRP_0
230 blank call blank call rbozivdq gmlhrtvp GRP_0
... ... ... ... ...
7215 account locked in ad account locked in ad upiyobvj lwohuizr GRP_0
7772 blank call // loud noise blank call // loud noise rbozivdq gmlhrtvp GRP_0
7847 issue on pricing in distributor_tool we have agreed price with many of the distribu... hbmwlprq ilfvyodx GRP_21
7905 unable to launch outlook unable to launch outlook wjtzrmqc ikqpbflg GRP_0
8092 reset passwords for prgthyuulla ramdntythanjes... the boirqctx bkijgqry GRP_17

83 rows × 4 columns

In [12]:
incidentsData = inc_dataLoader.removeDuplicateRows(incidentsData)
In [13]:
incidentsData = inc_dataLoader.combineDescription(incidentsData)
In [14]:
incidentsData.head(15)
Out[14]:
Short description Description New_Description Caller Assignment group
0 login issue -verified user details.(employee# & manager na... login issue -verified user details.(employee# ... spxjnwir pjlcoqds GRP_0
1 outlook \r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail... outlook \r\n\r\nreceived from: hmjdrvpb.komuay... hmjdrvpb komuaywn GRP_0
2 cant log in to vpn \r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail... cant log in to vpn \r\n\r\nreceived from: eylq... eylqgodm ybqkwiam GRP_0
3 unable to access hr_tool page unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0
4 skype error skype error skype error owlgqjme qhcozdfx GRP_0
5 unable to log in to engineering tool and skype unable to log in to engineering tool and skype unable to log in to engineering tool and skype eflahbxn ltdgrvkz GRP_0
6 event: critical:HostName_221.company.com the v... event: critical:HostName_221.company.com the v... event: critical:HostName_221.company.com the v... jyoqwxhz clhxsoqy GRP_1
7 ticket_no1550391- employment status - new non-... ticket_no1550391- employment status - new non-... ticket_no1550391- employment status - new non-... eqzibjhw ymebpoih GRP_0
8 unable to disable add ins on outlook unable to disable add ins on outlook unable to disable add ins on outlook mdbegvct dbvichlg GRP_0
9 ticket update on inplant_874773 ticket update on inplant_874773 ticket update on inplant_874773 fumkcsji sarmtlhy GRP_0
10 engineering tool says not connected and unable... engineering tool says not connected and unable... engineering tool says not connected and unable... badgknqs xwelumfz GRP_0
11 hr_tool site not loading page correctly hr_tool site not loading page correctly hr_tool site not loading page correctly dcqsolkx kmsijcuz GRP_0
12 unable to login to hr_tool to sgxqsuojr xwbeso... unable to login to hr_tool to sgxqsuojr xwbeso... unable to login to hr_tool to sgxqsuojr xwbeso... oblekmrw qltgvspb GRP_0
13 user wants to reset the password user wants to reset the password user wants to reset the password iftldbmu fujslwby GRP_0
14 unable to open payslips unable to open payslips unable to open payslips epwyvjsz najukwho GRP_0
In [15]:
incidentsData = inc_dataLoader.applyPOSTagging(incidentsData)

**EDA**

In [ ]:
# incidentsData.to_csv(project_path + 'input-processed-data.csv', header=True, index=False)
In [ ]:
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS 
import seaborn as sns
sns.set_style("whitegrid")
flatui = ['#2E82A8','#00A0B8','#00BDB4','#53D69F','#A5EB84','#F9F871']
In [ ]:
incidentsData.shape
Out[ ]:
(8417, 6)
In [ ]:
incidentsData.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8417 entries, 0 to 8499
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8417 non-null   object
 1   Description             8417 non-null   object
 2   New_Description         8417 non-null   object
 3   Caller                  8417 non-null   object
 4   Assignment group        8417 non-null   object
 5   Description_pos_tagged  8417 non-null   object
dtypes: object(6)
memory usage: 460.3+ KB
In [ ]:
incidentsData['Assignment group'].unique()
Out[ ]:
array(['GRP_0', 'GRP_1', 'GRP_3', 'GRP_4', 'GRP_5', 'GRP_6', 'GRP_7',
       'GRP_8', 'GRP_9', 'GRP_10', 'GRP_11', 'GRP_12', 'GRP_13', 'GRP_14',
       'GRP_15', 'GRP_16', 'GRP_17', 'GRP_18', 'GRP_19', 'GRP_2',
       'GRP_20', 'GRP_21', 'GRP_22', 'GRP_23', 'GRP_24', 'GRP_25',
       'GRP_26', 'GRP_27', 'GRP_28', 'GRP_29', 'GRP_30', 'GRP_31',
       'GRP_33', 'GRP_34', 'GRP_35', 'GRP_36', 'GRP_37', 'GRP_38',
       'GRP_39', 'GRP_40', 'GRP_41', 'GRP_42', 'GRP_43', 'GRP_44',
       'GRP_45', 'GRP_46', 'GRP_47', 'GRP_48', 'GRP_49', 'GRP_50',
       'GRP_51', 'GRP_52', 'GRP_53', 'GRP_54', 'GRP_55', 'GRP_56',
       'GRP_57', 'GRP_58', 'GRP_59', 'GRP_60', 'GRP_61', 'GRP_32',
       'GRP_62', 'GRP_63', 'GRP_64', 'GRP_65', 'GRP_66', 'GRP_67',
       'GRP_68', 'GRP_69', 'GRP_70', 'GRP_71', 'GRP_72', 'GRP_73'],
      dtype=object)

To analyse top 15 groups based on no of tickets assigned
In [ ]:
incidentsData1 = incidentsData.drop('Caller',axis=1)
incidentsData1 = incidentsData['Assignment group'].value_counts().reset_index()
incidentsData1['percentage'] = (incidentsData1['Assignment group']/incidentsData1['Assignment group'].sum())*100
incidentsData1.head(15)
Out[ ]:
index Assignment group percentage
0 GRP_0 3934 46.738743
1 GRP_8 645 7.663063
2 GRP_24 285 3.386005
3 GRP_12 257 3.053344
4 GRP_9 252 2.993941
5 GRP_2 241 2.863253
6 GRP_19 215 2.554354
7 GRP_3 200 2.376144
8 GRP_6 183 2.174171
9 GRP_13 145 1.722704
10 GRP_10 140 1.663300
11 GRP_5 128 1.520732
12 GRP_14 118 1.401925
13 GRP_25 116 1.378163
14 GRP_33 107 1.271237

To analyse bottom 15 groups based on no of tickets assigned
In [ ]:
incidentsData1.tail(15)
Out[ ]:
index Assignment group percentage
59 GRP_63 3 0.035642
60 GRP_68 3 0.035642
61 GRP_38 3 0.035642
62 GRP_56 3 0.035642
63 GRP_69 2 0.023761
64 GRP_72 2 0.023761
65 GRP_57 2 0.023761
66 GRP_71 2 0.023761
67 GRP_54 2 0.023761
68 GRP_35 1 0.011881
69 GRP_73 1 0.011881
70 GRP_70 1 0.011881
71 GRP_64 1 0.011881
72 GRP_61 1 0.011881
73 GRP_67 1 0.011881
To check the tickets distribution between the groups
In [ ]:
plt.subplots(figsize = (20,8))

sns.countplot(x='Assignment group', data=incidentsData,order = incidentsData['Assignment group'].value_counts().index, palette = sns.color_palette(flatui))
plt.xlabel('Assignment Group') 
plt.ylabel('Count') 
plt.xticks(rotation=90)
plt.title('Tickets Distribution')

plt.show()

Groups having tickets more than 20
In [ ]:
incidentsData2 = incidentsData1.drop('percentage',axis=1)
incidentsData3= incidentsData2.loc[incidentsData2['Assignment group'] > 20 ]

plt.figure(figsize=(12,6))
bars = sns.barplot(x=incidentsData3['index'],y=incidentsData3['Assignment group'], palette = sns.color_palette(flatui))
plt.title('TOP 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
plt.tight_layout()
plt.show()

Groups that are having tickets less than 20
In [ ]:
incidentsData3= incidentsData2.loc[incidentsData2['Assignment group'] < 20 ]

plt.figure(figsize=(12,6))
bars = sns.barplot(x=incidentsData3['index'], y=incidentsData3['Assignment group'],palette = sns.color_palette(flatui))
plt.title('TOP 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
plt.tight_layout()
plt.show()

Visualizing tickets assigment to the groups based on bins
In [ ]:
incidentsData_ag = pd.DataFrame(incidentsData['Assignment group'].value_counts())
incidentsData_ag['percentage'] = (incidentsData_ag['Assignment group']/incidentsData_ag['Assignment group'].sum())*100

incidentsData_bins = pd.DataFrame(columns=['Description','Ticket Count'])
_1_10_ticket = {'Description':'1-10 ticket','Ticket Count':len(incidentsData_ag[incidentsData_ag['Assignment group'] < 11])}
_11_50_ticket = {'Description':'11-50 ticket',
              'Ticket Count':len(incidentsData_ag[(incidentsData_ag['Assignment group'] > 10)& (incidentsData_ag['Assignment group'] < 51) ])}
_51_100_ticket = {'Description':' 51-100 ticket',
              'Ticket Count':len(incidentsData_ag[(incidentsData_ag['Assignment group'] > 50)& (incidentsData_ag['Assignment group'] < 101) ])}
_101_250_ticket = {'Description':' 101-250 ticket',
              'Ticket Count':len(incidentsData_ag[(incidentsData_ag['Assignment group'] > 100)& (incidentsData_ag['Assignment group'] < 251)])}
_251_500_ticket = {'Description':' 251-500 ticket',
              'Ticket Count':len(incidentsData_ag[(incidentsData_ag['Assignment group'] > 250)& (incidentsData_ag['Assignment group'] < 501)])}
_501_above_ticket = {'Description':' >500 ticket',
              'Ticket Count':len(incidentsData_ag[(incidentsData_ag['Assignment group'] > 500)])}

#append row to the dataframe
incidentsData_bins = incidentsData_bins.append([_1_10_ticket,_11_50_ticket,_51_100_ticket,_101_250_ticket,_251_500_ticket,_501_above_ticket], ignore_index=True)
incidentsData_bins
Out[ ]:
Description Ticket Count
0 1-10 ticket 25
1 11-50 ticket 25
2 51-100 ticket 9
3 101-250 ticket 10
4 251-500 ticket 3
5 >500 ticket 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.pie(incidentsData_bins['Ticket Count'],labels=incidentsData_bins['Description'],autopct='%1.1f%%', startangle=90,  colors=flatui, pctdistance=0.80,);

#draw circle
centre_circle = plt.Circle((0,0),0.60,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Assignment Groups Distribution')

plt.axis('equal');
plt.tight_layout()
plt.show()

No of characters in the short descrption
In [ ]:
fig = plt.figure(figsize=(6,9))
text_len=incidentsData['Short description'].str.len()
sns.displot(text_len.dropna(),color='#00A0B8',binwidth=6)
fig.suptitle('Characters in short description')
plt.show()
<Figure size 432x648 with 0 Axes>

No of words in short description
In [ ]:
fig = plt.figure(figsize=(6,9))
text_len=incidentsData['Short description'].str.split().map(lambda x: len(str(x).split(" ")))
sns.displot(text_len.dropna(),color='#00A0B8',binwidth=3)
fig.suptitle('Words in short description')
plt.show()
<Figure size 432x648 with 0 Axes>
In [ ]:
fig = plt.figure(figsize=(6,9))
text_len=incidentsData['Description'].str.split().map(lambda x: len(str(x).split(" ")))
sns.displot(text_len.dropna(),color='#00A0B8',binwidth=20)
fig.suptitle('Words in description')
plt.show()
<Figure size 432x648 with 0 Axes>
In [ ]:
# import matplotlib.colors as mcolors
# def hex_to_rgb(value): 
#     value = value.strip("#") # removes hash symbol if present
#     lv = len(value)
#     return tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))


# def rgb_to_dec(value):   
#     return [v/256 for v in value]

# def get_continuous_cmap(hex_list, float_list=None):
#     rgb_list = [rgb_to_dec(hex_to_rgb(i)) for i in hex_list]
#     if float_list:
#         pass
#     else:
#         float_list = list(np.linspace(0,1,len(rgb_list)))
        
#     cdict = dict()
#     for num, col in enumerate(['red', 'green', 'blue']):
#         col_list = [[float_list[i], rgb_list[i][num], rgb_list[i][num]] for i in range(len(float_list))]
#         cdict[col] = col_list
#     cmp = mcolors.LinearSegmentedColormap('my_cmp', segmentdata=cdict, N=256)
#     return cmp

# custom_ramp = get_continuous_cmap(flatui)
In [ ]:
def f_word_cloud(column):
    
    comment_words = ' '
    stopwords = set(STOPWORDS)
    
    for val in column: 
         
        val = str(val) 
        
        tokens = val.split() 
         
        for i in range(len(tokens)): 
            tokens[i] = tokens[i].lower() 

        for words in tokens: 
            comment_words = comment_words + words + ' '


    wordcloud = WordCloud(width = 800, height = 800, 
                    background_color ='white',  #colormap=custom_ramp,
                    stopwords = stopwords, 
                    min_font_size = 10).generate(comment_words) 
    
    return wordcloud

Wordcloud of frequently used words in Description
In [ ]:
wordcloud = f_word_cloud(incidentsData['Description'])                       
plt.figure(figsize = (6, 6), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

Wordcloud of frequently used words in Short Description
In [ ]:
wordcloud = f_word_cloud(incidentsData['Short description'])         
plt.figure(figsize = (6, 6), facecolor = None) 
plt.imshow(wordcloud,interpolation="bilinear") 
plt.axis("off") 
plt.tight_layout(pad = 0) 

  
plt.show()

Wordcloud of frequently used words in Description for different Groups
In [ ]:
groups = ['GRP_0', 'GRP_8','GRP_24','GRP_12','GRP_9','GRP_2']
fig = plt.figure(figsize = (20, 18), facecolor = None)
i = 0
for group in groups:  
  plt.subplot(4,3,i+1)
  wordcloud = f_word_cloud(incidentsData[incidentsData['Assignment group']==group].Description)
  # plot the WordCloud image                        
   
  plt.imshow(wordcloud,interpolation="bilinear") 
  plt.axis("off") 
  plt.tight_layout(pad = 2)
  plt.title(group)    
  i = i + 1

plt.show()
In [ ]:
groups = ['GRP_19','GRP_3','GRP_6','GRP_13','GRP_10','GRP_5']
fig = plt.figure(figsize = (20, 18), facecolor = None)
i = 0
for group in groups:  
  plt.subplot(4,3,i+1)
  wordcloud = f_word_cloud(incidentsData[incidentsData['Assignment group']==group].Description)
  # plot the WordCloud image                        
   
  plt.imshow(wordcloud,interpolation="bilinear") 
  plt.axis("off") 
  plt.tight_layout(pad = 2)
  plt.title(group)    
  i = i + 1

plt.show()

**Topic Modeling**

Represent a documents with the topics it refers to; far simpler and more intuitive than representing the document with the words it contains

In [ ]:
X = incidentsData['New_Description']

1. Latent Semantic Analysis (LSA)

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True)


transformed_vector = vectorizer.fit_transform(X)
In [ ]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)

Y = svd_model.fit_transform(transformed_vector)
print('LSA shape:', Y.shape)
LSA shape: (8417, 20)
In [ ]:
explained_variance = svd_model.explained_variance_ratio_.sum()
print("Sum of explained variance ratio: %d%%" % (int(explained_variance * 100)))
Sum of explained variance ratio: 27%
In [ ]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ",end="")
    for t in sorted_terms:
        print(t[0],end=" ")
    print(" ")
Topic 0: 00 job job_scheduler failed 2016 monitoring_tool 09 com company received  
Topic 1: password reset erp account sid_34 locked user gmail windows password_management_tool  
Topic 2: account locked erp sid_34 unlock unable login issue ad user  
Topic 3: account locked reset sid_34 password unlock 00 windows job_scheduler job  
Topic 4: update ticket yes na account locked reset password circuit site  
Topic 5: yes na site circuit backup power outage _________ cert notified  
Topic 6: unable login outlook connect skype 00 sid_34 failed open launch  
Topic 7: erp sid_34 unlock production error slow log sid_1 printer logon  
Topic 8: outlook crm erp open working sid_34 launch ms emails responding  
Topic 9: user issue login outlook able id resolved confirmed crm caller  
Topic 10: abended 16 08 job job_scheduler login monitoring_tool company outlook 27  
Topic 11: mit probleme login nicht cid skype best bitte gmail image001  
Topic 12: vpn access probleme mit connect user working need printer usa  
Topic 13: internet explorer browser number email microsoft telephone summary customer language  
Topic 14: unlock access account user collaboration_platform unable email probleme mit id  
Topic 15: vpn unlock connect account gmail working com issue user login  
Topic 16: windows printer unlock error working skype print account laptop install  
Topic 17: skype working access audio error meeting unlock account certificate reset  
Topic 18: windows vpn access login erp skype working outlook log space  
Topic 19: issue error crm collaboration_platform account access probleme cid reset engineering_tool  
In [ ]:
import umap

X_topics = svd_model.fit_transform(transformed_vector)
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

plt.figure(figsize=(15,8))
plt.scatter(embedding[:, 0], embedding[:, 1], 
c = incidentsData['Assignment group'],
s = 10, # size
edgecolor='none'
)
plt.show()

LSA advantages:

LSA is fast and easy to implement and provide better results than a plain vector space model.

LSA disadvantages:

  1. LSA is a linear model, it might not do well on datasets with non-linear dependencies.
  2. It assumes a Gaussian distribution of the terms in the documents, which may not be true for all the problems.
  3. LSA involves SVD, which is computationally intensive and hard to update as new data comes up.

2. Latent Dirichlet Allocation (LDA)

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
count_vectorizer = CountVectorizer(ngram_range=(2, 2))

transformed_vector = count_vectorizer.fit_transform(X)
In [ ]:
transformed_vector.shape
Out[ ]:
(8417, 97198)
In [ ]:
feature_names = count_vectorizer.get_feature_names()
In [ ]:
len(feature_names)
Out[ ]:
97198
In [ ]:
feature_names[1000:1005]
Out[ ]:
['01d22a14 9ba833d0',
 '01d22a3d f7f2ecb0',
 '01d22a56 4f7ef4c0',
 '01d22ab4 97030051',
 '01d22ae8 0c3cb4d0']
In [ ]:
NUM_TOPICS = 20

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=20)

lda = lda_model.fit_transform(transformed_vector)
In [ ]:
lda.shape
In [ ]:
lda_model.components_.shape
Out[ ]:
(20, 97198)
In [ ]:
def print_topic(identifier, top_words=10):
    
    for topic_id, topic_word_probs in enumerate(lda_model.components_):

        if topic_id == identifier:
            print("Topic %d:" % (topic_id))

            top_feature_names = [feature_names[i] for i in topic_word_probs.argsort()[: -top_words - 1 : -1]]
            
            print(" ".join(top_feature_names))
In [ ]:
for i in range(20):
    print_topic(i, top_words=3)
Topic 0:
yes no no na backup circuit
Topic 1:
ticket update update on gmail com
Topic 2:
gmail com unable to received from
Topic 3:
in job_scheduler failed in job_scheduler at
Topic 4:
ticket and to the in the
Topic 5:
gmail com received from unable to
Topic 6:
unable to gmail com login to
Topic 7:
password reset erp sid_34 account locked
Topic 8:
is not not working in the
Topic 9:
79 63 63 203 18 79
Topic 10:
job in abended job com abended
Topic 11:
gmail com is now free on
Topic 12:
80 216 96 80 54 96
Topic 13:
gmail com received from 00 00
Topic 14:
lock out is not account lock
Topic 15:
the user to the login issue
Topic 16:
gmail com received from access to
Topic 17:
gmail com received from of the
Topic 18:
gmail com unable to received from
Topic 19:
outlook not disk agents media agents
In [ ]:
topics = []
NUM_SAMPLES = 8417
for i in range(NUM_SAMPLES):
    topics.append(lda[i].argmax())
In [ ]:
incidentsData['topic'] = topics

incidentsData.head()
Out[ ]:
Short description Description New_Description Caller Assignment group Description_pos_tagged topic
0 login issue -verified user details.(employee# & manager na... login issue -verified user details.(employee# ... spxjnwir pjlcoqds GRP_0 [(login, JJ), (issue, NN), (-verified, VBD), (... 15
1 outlook \r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail... outlook \r\n\r\nreceived from: hmjdrvpb.komuay... hmjdrvpb komuaywn GRP_0 [(outlook, NN), (received, VBD), (from, IN), (... 1
2 cant log in to vpn \r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail... cant log in to vpn \r\n\r\nreceived from: eylq... eylqgodm ybqkwiam GRP_0 [(cant, JJ), (log, NN), (in, IN), (to, TO), (v... 6
3 unable to access hr_tool page unable to access hr_tool page unable to access hr_tool page xbkucsvz gcpydteq GRP_0 [(unable, JJ), (to, TO), (access, NN), (hr_too... 10
4 skype error skype error skype error owlgqjme qhcozdfx GRP_0 [(skype, NN), (error, NN)] 16
In [ ]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components = 2, init = 'pca', random_state = 0)
In [ ]:
lda_2d_tsne = tsne.fit_transform(lda)
In [ ]:
lda_df_tsne = pd.DataFrame(lda_2d_tsne)

lda_df_tsne.shape
Out[ ]:
(8417, 2)
In [ ]:
plt.subplots(figsize = (12, 10));
plt.axis('equal');

plt.scatter(lda_df_tsne[0], lda_df_tsne[1], c = incidentsData['topic'], cmap = plt.cm.Spectral);
In [ ]:
count_vectorizer = CountVectorizer(max_df=1000, min_df=3)

transformed_vector = count_vectorizer.fit_transform(X)
In [ ]:
feature_names = count_vectorizer.get_feature_names()
In [ ]:
len(feature_names)
Out[ ]:
5710
In [ ]:
feature_names[1000:1005]
Out[ ]:
['arbeiten', 'arbeitsplatz', 'arcgonvy', 'archive', 'archived']
In [ ]:
NUM_TOPICS = 20

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=100)

lda = lda_model.fit_transform(transformed_vector)
In [ ]:
def get_topic(identifier, top_words=10):
    
    for topic_id, topic_word_probs in enumerate(lda_model.components_):

        if topic_id == identifier:
            top_feature_names = [feature_names[i] for i in topic_word_probs.argsort()[: -top_words - 1 : -1]]
            
            return " ".join(top_feature_names)
In [ ]:
for i in range(20):
    print("Topic %d:" % (i))

    print(get_topic(i, top_words=3))
Topic 0:
this with have
Topic 1:
0x0 asa inside
Topic 2:
installation ms defekt
Topic 3:
cid update image001
Topic 4:
laptop pc need
Topic 5:
password reset account
Topic 6:
order delivery we
Topic 7:
report uacyltoe hxgaycze
Topic 8:
access user device
Topic 9:
this or ticket
Topic 10:
server space over
Topic 11:
no yes na
Topic 12:
00 job job_scheduler
Topic 13:
unable outlook vpn
Topic 14:
down usa since
Topic 15:
nicht mit die
Topic 16:
printer print if
Topic 17:
event ip id
Topic 18:
sent this you
Topic 19:
monitor now 15
In [ ]:
topic_id_topic_mapping = {}

for i in range(20):
    topic_id_topic_mapping[i] = get_topic(i, top_words=3)
    
topic_id_topic_mapping
Out[ ]:
{0: 'this with have',
 1: '0x0 asa inside',
 2: 'installation ms defekt',
 3: 'cid update image001',
 4: 'laptop pc need',
 5: 'password reset account',
 6: 'order delivery we',
 7: 'report uacyltoe hxgaycze',
 8: 'access user device',
 9: 'this or ticket',
 10: 'server space over',
 11: 'no yes na',
 12: '00 job job_scheduler',
 13: 'unable outlook vpn',
 14: 'down usa since',
 15: 'nicht mit die',
 16: 'printer print if',
 17: 'event ip id',
 18: 'sent this you',
 19: 'monitor now 15'}
In [ ]:
topic_to_doc_mapping = {}
topic_list = []
topic_names = []

for i in range(NUM_SAMPLES):
    most_likely_topic =  lda[i].argmax()

    if most_likely_topic not in topic_to_doc_mapping:
        topic_to_doc_mapping[most_likely_topic] = []

    topic_to_doc_mapping[most_likely_topic].append(i)
    
    topic_list.append(most_likely_topic)
    topic_names.append(topic_id_topic_mapping[most_likely_topic])

incidentsData['Most_Likely_Topic'] = topic_list
incidentsData['Most_Likely_Topic_Names'] = topic_names
In [ ]:
print(topic_to_doc_mapping[0][:100])
In [ ]:
topic_of_interest = 19

doc_ids = topic_to_doc_mapping[topic_of_interest][:4]

for doc_index in doc_ids:
    print(X.iloc[doc_index])
outlook收到箱中folder变为每天一个folder,office提示更新。
ie浏览器打开crm系统后提示用户已被注销,无法登录crm系统。
msd - office 2013 outlook 打不开,显示无法创建新的堆栈防护页面 please provide details of the issue.
[‎2016/‎10/‎25 9:36] melhduty gqchtedl: 
hi 
我的电脑邮箱打不开了
你可以帮我吗?
谢谢
[‎2016/‎10/‎25 9:38] melhduty gqchtedl: 
显示系统错误
显示无法创建新的堆栈防护页面
[‎2016/‎10/‎25 9:40] obuwfnkm ufpwmybi: 
把你桌面共享给我
[‎2016/‎10/‎25 9:41] melhduty gqchtedl: 
id 

看不见a3 的文件夹(\\HostName_17\teams\business\ a3 ),请帮忙弄一下。sahtym wanthryg 
In [ ]:
from sklearn.manifold import Isomap

isomap = Isomap(n_neighbors = 20, n_components = 2)
In [ ]:
lda_2d_isomap = isomap.fit_transform(lda)
In [ ]:
lda_df_isomap = pd.DataFrame(lda_2d_isomap)

lda_df_isomap.head()
Out[ ]:
0 1
0 0.170935 0.037641
1 -0.457801 0.132597
2 0.017233 0.383353
3 0.048791 0.525409
4 0.154829 0.469272
In [ ]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder() 
incidentsData['Assignment group'] = label_encoder.fit_transform(incidentsData['Assignment group'])
In [ ]:
plt.subplots(figsize = (8, 8))
plt.axis('equal');

plt.scatter(lda_df_isomap[0], lda_df_isomap[1], c = incidentsData['Assignment group'], cmap = plt.cm.Spectral)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f4f524c0048>
In [ ]:
plt.subplots(figsize = (8, 8))
plt.axis('equal');

plt.scatter(lda_df_tsne[0], lda_df_tsne[1], c = incidentsData['Most_Likely_Topic'], cmap = plt.cm.Spectral)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f4f52834208>
In [ ]:
!pip install pyldavis -q
     |████████████████████████████████| 1.6MB 12.8MB/s 
  Building wheel for pyldavis (setup.py) ... done
In [ ]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
In [ ]:
panel = pyLDAvis.sklearn.prepare(lda_model, transformed_vector, count_vectorizer, mds='tsne')

panel
Out[ ]:
In [ ]:
 

**Feature Transformation using NLP Pipeline**

In [16]:
!pip install normalise -q
!sudo apt install openjdk-8-jdk -q
!sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java 
!pip install language-check -q
!pip install pycontractions -q

import numpy as np
import multiprocessing as mp
import re 

import nltk
for dependency in ("brown", "names", "wordnet", "averaged_perceptron_tagger", "universal_tagset"):
    nltk.download(dependency,quiet=True)

import string
import spacy 
import en_core_web_sm
from nltk.tokenize import word_tokenize
from sklearn.base import TransformerMixin, BaseEstimator
from normalise import normalise
from pycontractions import Contractions

cont = Contractions(api_key="glove-twitter-100") #A Python library for expanding and creating common English contractions in text.
nlp = en_core_web_sm.load()

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                #  key,
                 variety="BrE",
                 user_abbrevs={},
                 n_jobs=1):
   
        # self.key = key
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs
       

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        partitions = 1
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            partitions = cores
        elif self.n_jobs <= 0:
            return X_copy.apply(self._preprocess_text)
        else:
            partitions = min(self.n_jobs, cores)

        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()

        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        
        doc = cont.expand_texts(text) #I'd -> I would , I'd -> I had
        doc = self._remove_email_and_format(text)
        doc = nlp(doc)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)       
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):       
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=self.user_abbrevs, verbose=False))
        except:
            return text

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
      customize_stop_words = ['hi','hello','regards','dear','best','thanks','please','appreciate', 'great', 'day']
      for w in customize_stop_words:
        nlp.vocab[w].is_stop = True

      return [t for t in doc if not t.is_stop]

    def _remove_email_and_format(self, doc):     
      doc = ' '.join(re.sub("[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", doc).split())
      return doc.strip()

    def _lemmatize(self, doc):
      return ' '.join([t.lemma_ for t in doc])
In [28]:
text = TextPreprocessor(n_jobs=-1).transform(incidentsData['New_Description'])

**Handling Imbalanced Data**

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
#add to remove warning for python 3.6 dependency
import warnings
import pandas.util.testing as tm
%matplotlib inline
sns.set_style("whitegrid")
flatui = ['#2E82A8','#00A0B8','#00BDB4','#53D69F','#A5EB84','#F9F871']

descending_order = incidentsData['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))

ax=sns.countplot(x='Assignment group', data=incidentsData, palette = sns.color_palette(flatui),order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  
In [18]:
incidentsData[incidentsData['Assignment group']=='GRP_0'].shape
Out[18]:
(3934, 6)
  • We can see that data is highly imbalanced.
  • From all data, ~50% of the data belongs to GRP_0
  • Also, some groups have very few count e.g. groups with 1 entry
  • We can merge such group with small count of tickets
In [19]:
incidentsData_Group_minor = incidentsData.copy()
incidentsData_Group_minor['Assignment group'] = incidentsData_Group_minor['Assignment group'].apply(lambda x : 'other' if x not in ['GRP_0'] else x)#,'GRP_8' ,'GRP_9','GRP_12','GRP_24']

descending_order = incidentsData_Group_minor['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(5,5))
sns.countplot(x='Assignment group', data=incidentsData_Group_minor, palette = sns.color_palette(flatui),order=descending_order)

incidentsData_Others = incidentsData[incidentsData['Assignment group']!='GRP_0']
max_incident_cnt = incidentsData['Assignment group'].value_counts().max()

**1. Up-sample Minority Class**

Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal. We will use resample with replacement.
We'll create a new DataFrame with an up-sampled minority class. Here are the steps:

  • First, we'll separate observations from each class into different DataFrames.
  • Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
  • Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.
In [20]:
# Treat the imbalnce in the 'other' dataset by resampling
from sklearn.utils import resample

incidentsData_upsampled = incidentsData_Others[0:0]

# Upsample minority class
for grp in incidentsData_Others['Assignment group'].unique():
    incidentsData_Group = incidentsData_Others[incidentsData_Others['Assignment group'] == grp]
    resampled = resample(incidentsData_Group, 
                         replace=True, # sample with replacement
                         n_samples=int(max_incident_cnt/2), 
                         random_state=123) # reproducible results
    
    incidentsData_upsampled = incidentsData_upsampled.append(resampled)

incidentsData_Others_upsample = pd.concat([incidentsData_Group_minor[incidentsData_Group_minor['Assignment group']=='GRP_0'],incidentsData_upsampled])
incidentsData_Others_upsample.reset_index(inplace=True)

descending_order = incidentsData_upsampled['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))
#add code to rotate the labels
ax=sns.countplot(x='Assignment group', data=incidentsData_upsampled, palette = sns.color_palette(flatui),order=descending_order)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
In [21]:
incidentsData_upsampled[incidentsData_upsampled['Assignment group']=='GRP_0'].shape
Out[21]:
(0, 6)
In [22]:
incidentsData_Others_upsample[incidentsData_Others_upsample['Assignment group']=='other'].shape
Out[22]:
(0, 7)
In [23]:
inc_dataLoader.getUniqueAssignmentGroups(incidentsData_Others_upsample)
Out[23]:
array(['GRP_0', 'GRP_1', 'GRP_3', 'GRP_4', 'GRP_5', 'GRP_6', 'GRP_7',
       'GRP_8', 'GRP_9', 'GRP_10', 'GRP_11', 'GRP_12', 'GRP_13', 'GRP_14',
       'GRP_15', 'GRP_16', 'GRP_17', 'GRP_18', 'GRP_19', 'GRP_2',
       'GRP_20', 'GRP_21', 'GRP_22', 'GRP_23', 'GRP_24', 'GRP_25',
       'GRP_26', 'GRP_27', 'GRP_28', 'GRP_29', 'GRP_30', 'GRP_31',
       'GRP_33', 'GRP_34', 'GRP_35', 'GRP_36', 'GRP_37', 'GRP_38',
       'GRP_39', 'GRP_40', 'GRP_41', 'GRP_42', 'GRP_43', 'GRP_44',
       'GRP_45', 'GRP_46', 'GRP_47', 'GRP_48', 'GRP_49', 'GRP_50',
       'GRP_51', 'GRP_52', 'GRP_53', 'GRP_54', 'GRP_55', 'GRP_56',
       'GRP_57', 'GRP_58', 'GRP_59', 'GRP_60', 'GRP_61', 'GRP_32',
       'GRP_62', 'GRP_63', 'GRP_64', 'GRP_65', 'GRP_66', 'GRP_67',
       'GRP_68', 'GRP_69', 'GRP_70', 'GRP_71', 'GRP_72', 'GRP_73'],
      dtype=object)
In [24]:
inc_dataLoader.getUniqueAssignmentGroups(incidentsData_upsampled)
Out[24]:
array(['GRP_1', 'GRP_3', 'GRP_4', 'GRP_5', 'GRP_6', 'GRP_7', 'GRP_8',
       'GRP_9', 'GRP_10', 'GRP_11', 'GRP_12', 'GRP_13', 'GRP_14',
       'GRP_15', 'GRP_16', 'GRP_17', 'GRP_18', 'GRP_19', 'GRP_2',
       'GRP_20', 'GRP_21', 'GRP_22', 'GRP_23', 'GRP_24', 'GRP_25',
       'GRP_26', 'GRP_27', 'GRP_28', 'GRP_29', 'GRP_30', 'GRP_31',
       'GRP_33', 'GRP_34', 'GRP_35', 'GRP_36', 'GRP_37', 'GRP_38',
       'GRP_39', 'GRP_40', 'GRP_41', 'GRP_42', 'GRP_43', 'GRP_44',
       'GRP_45', 'GRP_46', 'GRP_47', 'GRP_48', 'GRP_49', 'GRP_50',
       'GRP_51', 'GRP_52', 'GRP_53', 'GRP_54', 'GRP_55', 'GRP_56',
       'GRP_57', 'GRP_58', 'GRP_59', 'GRP_60', 'GRP_61', 'GRP_32',
       'GRP_62', 'GRP_63', 'GRP_64', 'GRP_65', 'GRP_66', 'GRP_67',
       'GRP_68', 'GRP_69', 'GRP_70', 'GRP_71', 'GRP_72', 'GRP_73'],
      dtype=object)
In [25]:
##Pre-Processing label Encoding on  Assignment Group
import seaborn as sns
import matplotlib.pyplot as plt
#add to remove warning for python 3.6 dependency
import warnings
import pandas.util.testing as tm
%matplotlib inline

descending_order = incidentsData_Others_upsample['Assignment group'].value_counts().sort_values(ascending=False).index
plt.subplots(figsize=(22,5))

ax=sns.countplot(x='Assignment group', data=incidentsData_Others_upsample, palette = sns.color_palette(flatui))
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()
plt.show()
In [26]:
incidentsData_Others_upsample.shape
Out[26]:
(147525, 7)

**Using SMOTE**

Oversampling increases the weight of the minority class by replicating the minority class examples. Although it does not increase information, it raises the over-fitting issue, which causes the model to be too specific. It may well be the case that the accuracy for the training set is high, yet the performance for new datasets is actually worse.

Synthetic Minority Over-sampling Technique (SMOTE). This method is considered a state-of-art technique and one of the most commonly used oversampling methods to solve the imbalance problem. This method generates synthetic data based on the feature space similarities between existing minority instances. In order to create a synthetic instance, it finds the K-nearest neighbors of each minority instance, randomly selects one of them, and then calculate linear interpolations to produce a new minority instance in the neighborhood.

However, we have challenge to use SMOTE technique for our dataset. As we are having 74 classes and 8417 samples in the training set. so on average each sample should have atleast 113 samples per class. However, our dataset is extremely imbalanced. There are classes with only one samples and the number of neighbors is 6. So, we cannot use only SMOTE.

We will try to solve this problem by combining RandomOverSampler and SMOTE.

Random oversampling for the minority class </br> Random oversampling simply replicates randomly the minority class examples. Random oversampling is known to increase the likelihood of occurring overfitting. However, we will use SMOTE to overcome this issue as shown below

``` from imblearn.over_sampling import SMOTE from imblearn.over_sampling import RandomOverSampler ros = RandomOverSampler(random_state=777) X_ROS, y_ROS = ros.fit_sample(vectorized_df, y) smote = SMOTE() X_smote,y_smote = smote.fit_sample(X_ROS, y_ROS) ```

**ML Modeling**

Model Output

In [27]:
# Create DataFrame to store ModelPerformance result
resultColumn_names = ('Model', 'Train_Acc', 'Test_Acc' ,'Precision', 'Recall', 'F1_Score')
resultsDf = pd.DataFrame(columns = resultColumn_names)
In [28]:
# Store result in CSV
def save_model_result(overwrite=False):
  global resultsDf
  if 'New_ID' in resultsDf:
      resultsDf.drop('New_ID',axis=1)
  if 'New_ID' not in resultsDf:
      resultsDf.insert(0, 'New_ID', range(0,  len(resultsDf)))
  resultsDf = resultsDf.set_index('New_ID')

  if overwrite == True:
    resultsDf.to_csv(project_path + 'model-output.csv', header=True, index=True)
  else:   
    resultsDf.to_csv(project_path + 'model-output.csv', header=False, index=True, mode='a')
save_model_result()
In [29]:
def get_model_result_fromFile():
  df = pd.read_csv(project_path + 'model-output.csv')
  df['New_ID'] = range(1, 1+len(df))
  return df
In [30]:
def update_model_score(model_name, train_accuracy, accuracy, precision, recall, f1_score):
  global resultsDf
  #Store the accuracy results for each model in a dataframe for final comparison  
  tempResultsDf = pd.DataFrame({'Model':[model_name], 'Train_Acc':train_accuracy,'Test_Acc': accuracy, 'Precision': precision,
                                'Recall':recall, 'F1_Score':f1_score})
  tempResultsDf = tempResultsDf[['Model', 'Train_Acc','Test_Acc', 'Precision', 'Recall', 'F1_Score']]
  resultsDf = resultsDf.append(tempResultsDf)
  return resultsDf
In [31]:
# Function To display Model Performance
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

def PrintModelPerformanceReport(model, x_train, y_train, x_test, y_test,  modelType):    
    # Fit the model
    model.fit(x_train, y_train)
    
    #predict on test 
    y_predict = model.predict(x_test)
    avg_method='weighted'

    print(colors.get('BOLD') + colors.get('DARKCYAN')+ '--'*30 + colors.get('END'))
    print(colors.get('BOLD') + colors.get('BLUE')+'\t\t\t'+modelType + colors.get('END'))
    print(colors.get('BOLD') + colors.get('DARKCYAN')+ '--'*30 + colors.get('END'))

    acc_Test = accuracy_score(y_test,y_predict,normalize=True)
    precision = precision_score(y_test, y_predict, average=avg_method, labels=np.unique(y_predict))
    recall = recall_score(y_test, y_predict, average=avg_method, labels=np.unique(y_predict))
    accuracy_count = accuracy_score(y_test, y_predict,normalize=False)
    f1_score = 2 * (precision * recall) / (precision + recall)
   
    acc_Train = model.score(x_train, y_train)   

    print('Train Accuracy Score: \t',acc_Train)
    print('Test Accuracy Score: \t',acc_Test)
    print('Precision: \t\t',precision)
    print('Recall: \t\t',recall)  
    print('F1 Score: \t\t',f1_score) 
    print('Accuracy Count: \t',accuracy_count)
    print("Length of testing data: ", len(y_test))
    print(colors.get('BOLD') + colors.get('DARKCYAN')+ '--'*30 + colors.get('END'))  

    update_model_score(modelType, acc_Train, acc_Test, precision, recall, f1_score)

1. ML Models

In [ ]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier 
In [ ]:
x_train, x_test, y_train, y_test = train_test_split(incidentsData['New_Description'], incidentsData['Assignment group'], test_size = 0.2,random_state=0)
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

tf_pipeline = Pipeline([                         
                          ('to_dense', TextPreprocessor()),
                          ('vect', TfidfVectorizer()),
                          ('tfidf', TfidfTransformer()) ])

pipeline_ct = make_pipeline(
     TextPreprocessor(),
     CountVectorizer(ngram_range=(2,2)), 
     TfidfTransformer(),
     FunctionTransformer(lambda x: x.todense(), accept_sparse=True))
In [ ]:
models = []

models.append(('Naïve Bayes - Original Data', GaussianNB(), pipeline_ct))
models.append(('SVM - Original Data', SVC(C=1.0, kernel='linear', degree=3, gamma='auto'), tf_pipeline))
models.append(('Multinomial Naïve Bayes - Original Data',  MultinomialNB(), pipeline_ct))
models.append(('SGD Classifier - Original Data', SGDClassifier(loss='hinge', verbose=0,random_state=1, learning_rate='invscaling',eta0=1),pipeline_ct))
models.append(('Decision Tree - Original Data', DecisionTreeClassifier(), tf_pipeline))
models.append(('Random Forest - Original Data', RandomForestClassifier(),pipeline_ct))
models.append(('AdaBoost - Original Data', AdaBoostClassifier(),tf_pipeline))
models.append(('Bagging - Original Data', BaggingClassifier(n_estimators=100),tf_pipeline))
models.append(('Gradient Boosting - Original Data', GradientBoostingClassifier(),tf_pipeline))
models.append(('ExtraTrees - Original Data',ExtraTreesClassifier(n_estimators = 100),tf_pipeline))
In [ ]:
for name, model, pipeline in models:
   pipe = Pipeline([ ('features', pipeline),
                    ('model',model)
                  ])    
   PrintModelPerformanceReport(pipe, x_train, y_train, x_test, y_test, name)
save_model_result()
------------------------------------------------------------
			Decision Tree - Original Data
------------------------------------------------------------
Train Accuracy Score: 	 0.9928709342046635
Test Accuracy Score: 	 0.5896674584323041
Precision: 		 0.5951454112657192
Recall: 		 0.5953237410071942
F1 Score: 		 0.5952345627797491
Accuracy Count: 	 993
Length of testing data:  1684
------------------------------------------------------------
------------------------------------------------------------
			Random Forest - Original Data
------------------------------------------------------------
Train Accuracy Score: 	 0.9901975345314125
Test Accuracy Score: 	 0.6051068883610451
Precision: 		 0.6538515389202167
Recall: 		 0.6341008089607966
F1 Score: 		 0.6438247353664897
Accuracy Count: 	 1019
Length of testing data:  1684
------------------------------------------------------------
------------------------------------------------------------
			AdaBoost - Original Data
------------------------------------------------------------
Train Accuracy Score: 	 0.5138868260804991
Test Accuracy Score: 	 0.5237529691211401
Precision: 		 0.5209371898605262
Recall: 		 0.9504310344827587
F1 Score: 		 0.6729992724705817
Accuracy Count: 	 882
Length of testing data:  1684
------------------------------------------------------------
------------------------------------------------------------
			Bagging - Original Data
------------------------------------------------------------
Train Accuracy Score: 	 0.9925738897965246
Test Accuracy Score: 	 0.6656769596199525
Precision: 		 0.6580280938132603
Recall: 		 0.6843711843711844
F1 Score: 		 0.6709411621876901
Accuracy Count: 	 1121
Length of testing data:  1684
------------------------------------------------------------
------------------------------------------------------------
			Gradient Boosting - Original Data
------------------------------------------------------------
Train Accuracy Score: 	 0.9829199465320065
Test Accuracy Score: 	 0.6306413301662708
Precision: 		 0.6135893023755332
Recall: 		 0.6370725854829035
F1 Score: 		 0.625110474835671
Accuracy Count: 	 1062
Length of testing data:  1684
------------------------------------------------------------
In [ ]:
save_model_result(True)
In [ ]:
resultsDf
In [ ]:
#Try Good performing models on upsampled data
x_train, x_test, y_train, y_test = train_test_split(incidentsData_Others_upsample['New_Description'], incidentsData_Others_upsample['Assignment group'], test_size = 0.2,random_state=0)
In [ ]:
models = []
models.append(('Bagging with Over-Sampling', BaggingClassifier(n_estimators=10),tf_pipeline))
models.append(('ExtraTrees with Over-Sampling',ExtraTreesClassifier(n_estimators = 10),tf_pipeline))
In [ ]:
for name, model, pipeline in models:
   pipe = Pipeline([ ('features', pipeline),
                    ('model',model)
                  ]) 
   PrintModelPerformanceReport(pipe, x_train, y_train, x_test, y_test, name)
save_model_result()
------------------------------------------------------------
			Bagging with Over-Sampling
------------------------------------------------------------
Train Accuracy Score: 	 0.9906032875783766
Test Accuracy Score: 	 0.9826470089815286
Precision: 		 0.9841404299176464
Recall: 		 0.9826470089815286
F1 Score: 		 0.9833931524574336
Accuracy Count: 	 28993
Length of testing data:  29505
------------------------------------------------------------
------------------------------------------------------------
			ExtraTrees with Over-Sampling
------------------------------------------------------------
Train Accuracy Score: 	 0.9908998474834774
Test Accuracy Score: 	 0.9890188103711235
Precision: 		 0.9904342737860568
Recall: 		 0.9890188103711235
F1 Score: 		 0.9897260359951939
Accuracy Count: 	 29181
Length of testing data:  29505
------------------------------------------------------------

**Word Embeddings**

The word embeddings we explored are

  • Word2vec
  • TF-IDF
  • Weighted word2vec
  • Pre-train GloVe word2vec,
  • Doc2vec and
  • FastText
In [ ]:
import multiprocessing
import sys
import gensim
from gensim.models.word2vec import Word2Vec

workers = multiprocessing.cpu_count()
print('number of cpu: {}'.format(workers))
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be slow otherwise."
number of cpu: 2
In [ ]:
doc_words=[]
for i in text:
    li = list(i.split(" "))
    doc_words.append(li)

Here we have chosen the dimension size 100 for each word embedding and window size of 5. The training iterates for 100 times.

In [ ]:
word_model = Word2Vec(doc_words,
                      min_count=2,
                      size=300,
                      window=5,
                      workers=workers,
                      iter=100)

Averaging Word Embedding for Each Doc

We will be using the word embedding to compute for representative vector for whole text. It then serves as feature input for text classification model.

1. Simple Averaging on Word Embedding

This is a straightforward method. It directly averages all word embedding occurred in the text.

In [ ]:
class MeanEmbeddingVectorizer(object):

	def __init__(self, word_model):
		self.word_model = word_model
		self.vector_size = word_model.wv.vector_size

	def fit(self):  # comply with scikit-learn transformer requirement
		return self

	def transform(self, docs):  # comply with scikit-learn transformer requirement
		doc_word_vector = self.word_average_list(docs)
		return doc_word_vector

	def word_average(self, sent):
		"""
		Compute average word vector for a single doc/sentence.
		:param sent: list of sentence tokens
		:return:
			mean: float of averaging word vectors
		"""
		mean = []
		for word in sent:
			if word in self.word_model.wv.vocab:
				mean.append(self.word_model.wv.get_vector(word))

		if not mean:  # empty words
			# If a text is empty, return a vector of zeros.
			# print("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		else:
			mean = np.array(mean).mean(axis=0)
			return mean


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vector in shape (len(docs),)
		"""
		return np.vstack([self.word_average(sent) for sent in docs])
In [ ]:
mean_vec_tr = MeanEmbeddingVectorizer(word_model)
doc_vec = mean_vec_tr.transform(text)
In [ ]:
# Save word averaging doc2vec.
print('Shape of word-mean doc2vec...')
display(doc_vec.shape)
print('Save word-mean doc2vec as csv file...')
np.savetxt(os.path.join(project_path, 'Embeddings/doc_vec.csv'), doc_vec, delimiter=',')
Shape of word-mean doc2vec...
(8417, 300)
Save word-mean doc2vec as csv file...

2. TF-IDF Weighted Averaging on Word Embedding

We can further adopt TF-IDF as weights for each word embedding. This will amplify the role of significant word in computing doc vector. Here, the whole process is implemented under class TfidfEmbeddingVectorizer.

In [ ]:
from collections import defaultdict 
from sklearn.feature_extraction.text import TfidfVectorizer
class TfidfEmbeddingVectorizer(object):

	def __init__(self, word_model):

		self.word_model = word_model
		self.word_idf_weight = None
		self.vector_size = word_model.wv.vector_size

	def fit(self, docs):  # comply with scikit-learn transformer requirement
		"""
		Fit in a list of docs, which had been preprocessed and tokenized,
		such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
		Then build up a tfidf model to compute each word's idf as its weight.
		Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
		:param
			pre_processed_docs: list of docs, which are tokenized
		:return:
			self
		"""
		tfidf = TfidfVectorizer()
		tfidf.fit(docs)  # must be list of text string

		# if a word was never seen - it must be at least as infrequent
		# as any of the known words - so the default idf is the max of
		# known idf's
		max_idf = max(tfidf.idf_)  # used as default value for defaultdict
		self.word_idf_weight = defaultdict(lambda: max_idf,
						   [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
		return self


	def transform(self, docs):  # comply with scikit-learn transformer requirement
		doc_word_vector = self.word_average_list(docs)
		return doc_word_vector


	def word_average(self, sent):
		"""
		Compute average word vector for a single doc/sentence.
		:param sent: list of sentence tokens
		:return:
			mean: float of averaging word vectors
		"""

		mean = []
		for word in sent:
			if word in self.word_model.wv.vocab:
				mean.append(self.word_model.wv.get_vector(word) * self.word_idf_weight[word])  # idf weighted

		if not mean:  # empty words
			# If a text is empty, return a vector of zeros.
			print("cannot compute average owing to no vector for {}".format(sent))
			return np.zeros(self.vector_size)
		else:
			mean = np.array(mean).mean(axis=0)
			return mean


	def word_average_list(self, docs):
		"""
		Compute average word vector for multiple docs, where docs had been tokenized.
		:param docs: list of sentence in list of separated tokens
		:return:
			array of average word vector in shape (len(docs),)
		"""
		return np.vstack([self.word_average(sent) for sent in docs])
In [ ]:
tfidf_vec_tr = TfidfEmbeddingVectorizer(word_model)
tfidf_vec_tr.fit(text)
tfidf_doc_vec = tfidf_vec_tr.transform(text)
In [ ]:
# Save tfidf word averaging doc2vec.
print('Shape of tfidf-word-mean doc2vec...')
display(tfidf_doc_vec.shape)
print('Save tfidf-word-mean doc2vec as csv file...')
np.savetxt(os.path.join(project_path, 'Embeddings/tfidf_doc_vec.csv'), tfidf_doc_vec, delimiter=',')
Shape of tfidf-word-mean doc2vec...
(8417, 300)
Save tfidf-word-mean doc2vec as csv file...

3. Pre-train GloVe Word Embedding

Leveraging the existing pre-trained word embedding and see how it performs in text classification. Here we have used simple averaging on GloVe word vector

In [ ]:
from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Load in GloVe vector.
glove_vec_fi = datapath(project_path + 'glove/glove.6B.300d.txt')
tmp_word2vec_fi = get_tmpfile('glove.6B.300d.txt')

glove2word2vec(glove_vec_fi, tmp_word2vec_fi)

glove_word_model = KeyedVectors.load_word2vec_format(tmp_word2vec_fi)
In [ ]:
# Apply word averaging on GloVe word vector.
glove_mean_vec_tr = MeanEmbeddingVectorizer(glove_word_model)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:5: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).
  """
In [ ]:
glove_doc_vec = glove_mean_vec_tr.transform(text)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:23: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:24: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).
In [ ]:
# Examine if glove_doc_vec is equal to self-trained doc_vec.
print('Examine if glove_doc_vec is equal to self-trained doc_vec...')
glove_doc_vec[4] == doc_vec[4]
In [ ]:
# Save glove word averaging doc2vec.
print('Shape of glove-word-mean doc2vec...')
display(glove_doc_vec.shape)

print('Save glove-word-mean doc2vec as csv file...')
np.savetxt(os.path.join(project_path, 'Embeddings/glove_doc_vec.csv'), glove_doc_vec, delimiter=',')
Shape of glove-word-mean doc2vec...
(8417, 300)
Save glove-word-mean doc2vec as csv file...

4. Apply Doc2vec

We will directly train doc2vec,without doing average all word embeddings. Here we have chosen PV-DM model to train doc2vec. The class just needs to take in the TaggedDocument and then we call self.custom_train() method, the doc model will train itself.

In [ ]:
from gensim.models import Doc2Vec

class DocModel(object):

	def __init__(self, docs, **kwargs):
		"""

		:param docs: list of TaggedDocument
		:param kwargs: dictionary of (key,value) for Doc2Vec arguments
		"""
		self.model = Doc2Vec(**kwargs)
		self.docs = docs
		self.model.build_vocab([x for x in self.docs])

	def custom_train(self, fixed_lr=False, fixed_lr_epochs=None):
		"""
		Train Doc2Vec with two options, without fixed learning rate(recommended) or with fixed learning rate.
		Fixed learning rate also includes implementation of shuffling training dataset.


		:param fixed_lr: boolean
		:param fixed_lr_epochs: num of epochs for fixed lr training
		"""
		if not fixed_lr:
			self.model.train([x for x in self.docs],
					 total_examples=len(self.docs),
					 epochs=self.model.epochs)
		else:
			for _ in range(fixed_lr_epochs):
				self.model.train(utils.shuffle([x for x in self.docs]),
						 total_examples=len(self.docs),
						 epochs=1)
				self.model.alpha -= 0.002
				self.model.min_alpha = self.model.alpha  # fixed learning rate


	def test_orig_doc_infer(self):
		"""
		Use the original doc as input for model's vector inference,
		and then compare using most_similar()
		to see if model finds the original doc id be the most similar doc to the input.
		"""
		idx = np.random.randint(len(self.docs))
		print('idx: ' + str(idx))
		doc = [doc for doc in self.docs if doc.tags[0] == idx]
		inferred_vec = self.model.infer_vector(doc[0].words)
		print(self.model.docvecs.most_similar([inferred_vec]))  # wrap vec in a list
In [ ]:
# Configure keyed arguments for Doc2Vec model.
dm_args = {
    'dm': 1,
    'dm_mean': 1,
    'vector_size': 100,
    'window': 5,
    'negative': 5,
    'hs': 0,
    'min_count': 2,
    'sample': 0,
    'workers': workers,
    'alpha': 0.025,
    'min_alpha': 0.025,
    'epochs': 100,
    'comment': 'alpha=0.025'
}
In [ ]:
from gensim.models.doc2vec import TaggedDocument

doc_ids = np.arange(len(text))
tagdocs = [TaggedDocument(words=words, tags=[tag]) for words, tag in zip(text, doc_ids)]
In [ ]:
# Instantiate a pv-dm model.
dm = DocModel(docs=tagdocs, **dm_args)
In [ ]:
dm.custom_train()
In [ ]:
# Save doc2vec as feature dataframe.
dm_doc_vec_list = []
for i in range(len(dm.model.docvecs)):
    dm_doc_vec_list.append(dm.model.docvecs[i])


dm_doc_vec = pd.DataFrame(dm_doc_vec_list)
print('Shape of dm doc2vec...')
display(dm_doc_vec.shape)

print('Save dm doc2vec as csv file...')
dm_doc_vec.to_csv(os.path.join(project_path , 'Embeddings/dm_doc_vec.csv'), index=False, header=False)
Shape of dm doc2vec...
(8417, 100)
Save dm doc2vec as csv file...

5. Fast Text

We will directly train doc2vec,without doing average all word embeddings. Here we have chosen PV-DM model to train doc2vec. The class just needs to take in the TaggedDocument and then we call self.custom_train() method, the doc model will train itself.

In [ ]:
from gensim.models.fasttext import FastText

# Set values for various parameters
feature_size = 300    # Word vector dimensionality  
window_context = 50          # Context window size                                                                                    
min_word_count = 5   # Minimum word count                        
In [ ]:
# sg decides whether to use the skip-gram model (1) or CBOW (0)
ft_model = FastText(text, size=feature_size, window=window_context, 
                    min_count=min_word_count,sg=1, iter=100)
In [ ]:
# Apply word averaging on Fast Text word vector.
ft_mean_vec_tr = MeanEmbeddingVectorizer(ft_model)
In [ ]:
ft_doc_vec = ft_mean_vec_tr.transform(text)
In [ ]:
# Save Fast Text word averaging doc2vec.
print('Shape of glove-word-mean doc2vec...')
print(ft_doc_vec.shape)

print('Save fastText-word-mean doc2vec as csv file...')
np.savetxt(os.path.join(project_path, 'Embeddings/fast_text_vec.csv'), ft_doc_vec, delimiter=',')
Shape of glove-word-mean doc2vec...
(8417, 300)
Save fastText-word-mean doc2vec as csv file...

Word Embeddings in ML Models

In [ ]:
import os
import pandas as pd

# Read in saved files.
doc_vec = pd.read_csv(os.path.join(project_path, 'Embeddings/doc_vec.csv'), header=None)
tfidf_doc_vec = pd.read_csv(os.path.join(project_path, 'Embeddings/tfidf_doc_vec.csv'), header=None)
glove_doc_vec = pd.read_csv(os.path.join(project_path, 'Embeddings/glove_doc_vec.csv'), header=None)
dm_doc_vec = pd.read_csv(os.path.join(project_path, 'Embeddings/dm_doc_vec.csv'), header=None)
ft_doc_vec = pd.read_csv(os.path.join(project_path, 'Embeddings/fast_text_vec.csv'), header=None)
In [ ]:
df = ft_doc_vec
concate = False
concat_df = dm_doc_vec
In [ ]:
import math
from sklearn.model_selection import train_test_split

def main(model, df, concate, concat_df,target_labels):
    if concate:
        df = pd.concat([pd.DataFrame(df), pd.DataFrame(concat_df)], axis=1, ignore_index=True)
    else:
        df = df

    # Specify train/valid/test size.
    train_size, valid_size, test_size = split_size(df, train=0.8, valid=0.)  # no need to use valid dataset here
    
    # Prepare test dataset.
    train_X, test_X, train_y, test_y = train_test_split(df,
                                                    target_labels,
                                                    test_size=test_size,
                                                    random_state=1)
                                                   

    # Prepare valid dataset.
    if valid_size != 0:
        train_X, valid_X, train_y, valid_y = train_test_split(train_X,
                                                      train_y,
                                                      test_size=valid_size,
                                                      random_state=1,
                                                      stratify=train_y)
    
    print('Shape of train_X: {}'.format(train_X.shape))
    print('Shape of train_Y: {}'.format(train_y.shape))
    print('Shape of valid_X: {}'.format(valid_X.shape if 'valid_X' in vars() else (0,0)))
    print('Shape of text_X: {}'.format(test_X.shape))
    
    # model.fit(train_X, train_y)
    
    if valid_size != 0:
        return model, train_X, valid_X, test_X, train_y, valid_y, test_y
    else:
        return model, train_X, None, test_X, train_y, None, test_y

def split_size(df, train=0.5, valid=0.3):
    train_size = math.floor(len(df) * train)
    valid_size = math.floor(len(df) * valid)
    test_size = len(df) - train_size - valid_size
    return train_size, valid_size, test_size
In [ ]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
models = []

models.append(('SGD Classifier', SGDClassifier(loss='hinge',                   
                    verbose=0,
                    random_state=1,
                    learning_rate='invscaling',
                    eta0=1)))
models.append(('SVM', SVC(C=1.0, kernel='linear', degree=3, gamma='auto')))
In [ ]:
for name, model in models:
  clf, x_train, valid_X, x_test, y_train, valid_y, y_test = main(model, df, concate=False, 
                                                               concat_df=concat_df, target_labels = incidentsData['Assignment group']
                                                              )
  PrintModelPerformanceReport(clf, x_train, y_train, x_test, y_test, name)
In [ ]:
 

**Deep Learning Modeling**

Model Definations

In [17]:
# load the whole embedding into memory
def loadEmbeddingDictionary(embeddingType):
  switcher = { 
        "glove": project_path + 'Embeddings/glove_doc_vec.csv', 
        "original_glove": project_path + 'glove/glove.6B.300d.txt', 
        "doc2vec": project_path + 'Embeddings/dm_doc_vec.csv', 
        "fast_text": project_path + 'Embeddings/fast_text_vec.csv',
        "word2vec-TfIdf": project_path + 'Embeddings/tfidf_doc_vec.csv',
        "word2vec-sim-avg": project_path + 'Embeddings/doc_vec.csv'
    }
  
  file_name = switcher.get(embeddingType, "") 
  embeddings_index = dict()
  f = open(file_name)
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  f.close()
  print('Loaded %s word vectors.' % len(embeddings_index))
  return embeddings_index
In [18]:
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
def getVariables_and_EmbeddingMatrix(doc_words,numWords, embedding_dimension, embeddings_index, oversampling=False):

  tokenizer = Tokenizer(num_words=numWords, lower=True,split=' ', char_level=False)
  tokenizer.fit_on_texts(doc_words)
  sequences = tokenizer.texts_to_sequences(doc_words)
  X = pad_sequences(sequences, maxlen=maxlen)
  labels = to_categorical(np.asarray(incidentsData['Assignment group']))
  if oversampling == True:
      label_encoder = preprocessing.LabelEncoder()       
      labels = to_categorical(np.asarray(
                                      label_encoder.fit_transform(incidentsData_Others_upsample['Assignment group'])))

  embedding_matrix = np.zeros((numWords+1, embedding_dimension))
  for i,word in tokenizer.index_word.items():
    if i<numWords+1:
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
  return X, labels, embedding_matrix
In [19]:
from sklearn.model_selection import train_test_split
def splitData(X,y):
    print("Number of Samples:", len(X))
    print("Number of Labels: ", len(y))
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=0)    
    print("Number of train Samples:", len(x_train))
   
    return x_train, x_test, y_train, y_test
In [20]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
In [21]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding,Input
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding,Bidirectional
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input
import tensorflow as tf
# from tensorflow.keras.utils import plot_model

def get_LSTM_Model(numWords, input_length, output_dimension, weights, additional_mectrics=False):
  model = Sequential(name="LSTMSequential")
  model.add(Embedding(numWords, output_dim=output_dimension, input_length=input_length, weights=[weights], trainable=False,name="Embedding"))
  model.add(Bidirectional(LSTM(128),name="BidirectionalLSTM"))
  model.add(Dropout(0.3, name="Dropout"))
  model.add(Dense(100, activation='relu',name="Dense"))
  model.add(Dense(y_train.shape[1], activation='softmax',name="DenseOutput"))

  optimzer = keras.optimizers.Adam(clipvalue=0.5) # clip value to avoid the gradient exploding

  if additional_mectrics == True:
    model.compile(loss='categorical_crossentropy', optimizer=optimzer, metrics=['acc',f1_m,precision_m, recall_m])
  else:
    model.compile(loss='categorical_crossentropy', optimizer=optimzer,metrics=['acc'])
  
  model.summary()
  # plot_model(model,to_file="LSTM_Model.jpg")
  return model
In [22]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding,Input
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding,Bidirectional
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input
import tensorflow as tf

def get_RNN_Model(numWords, input_length, output_dimension, weights, additional_mectrics=False):
        model = Sequential(name="RNNSequential")
        model.add(Embedding(numWords, output_dim=output_dimension, input_length=input_length, weights=[weights], trainable=False,name="Embedding"))
        model.add(Conv1D(100,10,activation='relu',name="Conv1D-1"))
        model.add(MaxPooling1D(pool_size=2,name="MaxPooling1D-1"))
        model.add(Dropout(0.3, name="Dropout-1"))
        model.add(Conv1D(100,10,activation='relu',name="Conv1D-2"))
        model.add(MaxPooling1D(pool_size=2, name="MaxPooling1D"))
        model.add(Bidirectional(LSTM(128), name="Bidirectional_LSTM"))
        model.add(Dropout(0.3, name="Dropout-2"))
        model.add(Dense(100,activation='relu', name="Dense"))
        model.add(Dense(y_train.shape[1], activation='softmax',name="DenseOutput"))

        optimzer = keras.optimizers.Adam(clipvalue=0.5) # clip value to avoid the gradient exploding
        if additional_mectrics == True:
          model.compile(loss='categorical_crossentropy', optimizer=optimzer, metrics=['acc',f1_m,precision_m, recall_m])
        else:
          model.compile(loss='categorical_crossentropy', optimizer=optimzer,metrics=['acc'])
        
        model.summary()
        # plot_model(model,to_file="LSTM_Model.jpg")
        return model
In [23]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding,Input
import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding,Bidirectional
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input
import tensorflow as tf

def get_GRU_Model(numWords, input_length, output_dimension, weights, additional_mectrics=False):
      model = Sequential(name="GRUSequential")
      model.add(Embedding(numWords, output_dim=output_dimension, input_length=input_length, weights=[weights], trainable=False,name="Embedding"))
      model.add(GRU(128, name="GRU"))
      model.add(Dropout(0.3, name="Dropout"))      
      model.add(Dense(100,activation='relu', name="Dense"))
      model.add(Dense(y_train.shape[1], activation='softmax',name="DenseOutput"))

      optimzer = keras.optimizers.Adam(clipvalue=0.5) # clip value to avoid the gradient exploding
      if additional_mectrics == True:
        model.compile(loss='categorical_crossentropy', optimizer=optimzer, metrics=['acc',f1_m,precision_m, recall_m])
      else:
        model.compile(loss='categorical_crossentropy', optimizer=optimzer,metrics=['acc'])
      
      model.summary()
      # plot_model(model,to_file="LSTM_Model.jpg")
      return model
In [24]:
def EvaluateModel(model, modelName, x_test, y_test, train_accuracy,additional_mectrics=False):
  if additional_mectrics == True:
    loss, accuracy, f1_score, precision, recall = model.evaluate(x_test, y_test, verbose=0)
    update_model_score(modelName,train_accuracy,accuracy,precision,recall,f1_score)
  else:
     loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
     update_model_score(modelName,train_accuracy,accuracy,'-','-','-')
In [25]:
 import matplotlib.pyplot as plt
 def plot_ModelAccuracy_Loss(history, modelname):
    plt.figure(figsize=(12,10))
    plt.subplot(2,2,1)
    plt.plot(history.history['acc'],color='#00A0B8')
    plt.plot(history.history['val_acc'],color='coral')

    plt.title(modelname+' model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train','test'], loc='upper left')
   

    plt.subplot(2,2,2)
    plt.plot(history.history['loss'],color='#00A0B8')
    plt.plot(history.history['val_loss'],color='coral')

    plt.title(modelname+' model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train','test'], loc='upper left')
    plt.show()
In [26]:
def TrainModel(model, modelname, x_train, y_train, epochs, batch_size, verbose):
  history = model.fit(x_train, y_train, epochs=epochs,batch_size=batch_size, validation_split=0.2, verbose=verbose)
  plot_ModelAccuracy_Loss(history, modelname=modelname)
  return np.mean(history.history['acc'])
In [29]:
doc_words=[]
for i in text:
    li = list(i.split(" "))
    doc_words.append(li)

1.LSTM Model

In [ ]:
incidentsData = inc_dataLoader.encodeAssignmentGroup(incidentsData)

Word2Vec with Simple Averaging

In [ ]:
embeddings_index = loadEmbeddingDictionary('word2vec-sim-avg')
Loaded 7666 word vectors.
In [ ]:
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Word2Vec(Simple Averaging)'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words,numWords, 300, embeddings_index)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, labels)
Number of Samples: 8417
Number of Labels:  8417
Number of train Samples: 6733
In [ ]:
model = get_LSTM_Model(embedding_matrix.shape[0], 300, 300,embedding_matrix)
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
Epoch 1/10
54/54 - 13s - loss: 4.2742 - acc: 0.4434 - val_loss: 4.2438 - val_acc: 0.4521
Epoch 2/10
54/54 - 3s - loss: 4.2137 - acc: 0.4664 - val_loss: 4.1852 - val_acc: 0.4521
Epoch 3/10
54/54 - 3s - loss: 4.1548 - acc: 0.4664 - val_loss: 4.1279 - val_acc: 0.4521
Epoch 4/10
54/54 - 3s - loss: 4.0974 - acc: 0.4664 - val_loss: 4.0722 - val_acc: 0.4521
Epoch 5/10
54/54 - 3s - loss: 4.0414 - acc: 0.4664 - val_loss: 4.0179 - val_acc: 0.4521
Epoch 6/10
54/54 - 3s - loss: 3.9869 - acc: 0.4664 - val_loss: 3.9651 - val_acc: 0.4521
Epoch 7/10
54/54 - 3s - loss: 3.9339 - acc: 0.4664 - val_loss: 3.9139 - val_acc: 0.4521
Epoch 8/10
54/54 - 3s - loss: 3.8821 - acc: 0.4664 - val_loss: 3.8639 - val_acc: 0.4521
Epoch 9/10
54/54 - 3s - loss: 3.8317 - acc: 0.4664 - val_loss: 3.8153 - val_acc: 0.4521
Epoch 10/10
54/54 - 3s - loss: 3.7827 - acc: 0.4664 - val_loss: 3.7678 - val_acc: 0.4521
In [ ]:
EvaluateModel(model, modelname, x_test, y_test, train_acc)

Word2Vec with TF-IDF Weighted Averaging

In [ ]:
embeddings_index = loadEmbeddingDictionary('word2vec-TfIdf')
Loaded 7666 word vectors.
In [ ]:
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Word2Vec(TF-IDF Weighted Averaging)'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words,numWords, 300, embeddings_index)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, labels)
Number of Samples: 8417
Number of Labels:  8417
Number of train Samples: 6733
In [ ]:
model = get_LSTM_Model(embedding_matrix.shape[0], 300, 300,embedding_matrix)
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
Epoch 1/10
54/54 - 7s - loss: 4.2741 - acc: 0.4489 - val_loss: 4.2436 - val_acc: 0.4521
Epoch 2/10
54/54 - 3s - loss: 4.2137 - acc: 0.4664 - val_loss: 4.1851 - val_acc: 0.4521
Epoch 3/10
54/54 - 3s - loss: 4.1547 - acc: 0.4664 - val_loss: 4.1277 - val_acc: 0.4521
Epoch 4/10
54/54 - 3s - loss: 4.0972 - acc: 0.4664 - val_loss: 4.0721 - val_acc: 0.4521
Epoch 5/10
54/54 - 3s - loss: 4.0411 - acc: 0.4664 - val_loss: 4.0177 - val_acc: 0.4521
Epoch 6/10
54/54 - 3s - loss: 3.9867 - acc: 0.4664 - val_loss: 3.9648 - val_acc: 0.4521
Epoch 7/10
54/54 - 3s - loss: 3.9335 - acc: 0.4664 - val_loss: 3.9136 - val_acc: 0.4521
Epoch 8/10
54/54 - 3s - loss: 3.8818 - acc: 0.4664 - val_loss: 3.8635 - val_acc: 0.4521
Epoch 9/10
54/54 - 3s - loss: 3.8314 - acc: 0.4664 - val_loss: 3.8148 - val_acc: 0.4521
Epoch 10/10
54/54 - 4s - loss: 3.7823 - acc: 0.4664 - val_loss: 3.7673 - val_acc: 0.4521
In [ ]:
EvaluateModel(model, modelname, x_test, y_test, train_acc)
In [ ]:
del model, embeddings_index, embedding_matrix, X

Doc2Vec

In [ ]:
embeddings_index = loadEmbeddingDictionary('doc2vec')
Loaded 8417 word vectors.
In [ ]:
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Doc2Vec'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words,numWords, 300, embeddings_index)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, labels)
Number of Samples: 8417
Number of Labels:  8417
Number of train Samples: 6733
In [ ]:
model = get_LSTM_Model(embedding_matrix.shape[0], 300, 300,embedding_matrix)
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
Epoch 1/10
54/54 - 6s - loss: 4.2741 - acc: 0.4664 - val_loss: 4.2437 - val_acc: 0.4521
Epoch 2/10
54/54 - 3s - loss: 4.2135 - acc: 0.4664 - val_loss: 4.1849 - val_acc: 0.4521
Epoch 3/10
54/54 - 3s - loss: 4.1546 - acc: 0.4664 - val_loss: 4.1274 - val_acc: 0.4521
Epoch 4/10
54/54 - 4s - loss: 4.0972 - acc: 0.4664 - val_loss: 4.0719 - val_acc: 0.4521
Epoch 5/10
54/54 - 3s - loss: 4.0412 - acc: 0.4664 - val_loss: 4.0178 - val_acc: 0.4521
Epoch 6/10
54/54 - 3s - loss: 3.9866 - acc: 0.4664 - val_loss: 3.9649 - val_acc: 0.4521
Epoch 7/10
54/54 - 3s - loss: 3.9335 - acc: 0.4664 - val_loss: 3.9133 - val_acc: 0.4521
Epoch 8/10
54/54 - 3s - loss: 3.8818 - acc: 0.4664 - val_loss: 3.8633 - val_acc: 0.4521
Epoch 9/10
54/54 - 3s - loss: 3.8314 - acc: 0.4664 - val_loss: 3.8148 - val_acc: 0.4521
Epoch 10/10
54/54 - 3s - loss: 3.7822 - acc: 0.4664 - val_loss: 3.7673 - val_acc: 0.4521
In [ ]:
EvaluateModel(model, modelname, x_test, y_test, train_acc)
In [ ]:
del model, embeddings_index, embedding_matrix, X

Fast Text

In [ ]:
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with FastText'
In [ ]:
from gensim.models.fasttext import FastText
ft = FastText(size=300, window=50, min_count=1, sg=1)
ft.build_vocab(text)
ft.train(text, total_examples=ft.corpus_count, epochs=10)
In [ ]:
ft.corpus_count
Out[ ]:
8417
In [ ]:
bigger_list=[]
for i in text:
    li = list(i.split(" "))
    bigger_list.append(li) 

tokenizer = Tokenizer(num_words=numWords, lower=True,split=' ', char_level=False)
tokenizer.fit_on_texts(bigger_list)
sequences = tokenizer.texts_to_sequences(bigger_list)
X = pad_sequences(sequences, maxlen=maxlen)
labels = to_categorical(np.asarray(incidentsData['Assignment group']))
In [ ]:
embedding_matrix_ft = np.random.random((len(tokenizer.word_index) + 1, ft.vector_size))
pas = 0
for word,i in tokenizer.word_index.items():    
    try:
        embedding_matrix_ft[i] = ft.wv[word]
    except:
        pas+=1
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, labels)
Number of Samples: 8417
Number of Labels:  8417
Number of train Samples: 6733
In [ ]:
model = get_LSTM_Model(len(tokenizer.word_index) + 1, 300, 300, embedding_matrix_ft,additional_mectrics=True)
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          6189900   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 6,662,370
Trainable params: 472,470
Non-trainable params: 6,189,900
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
Epoch 1/10
54/54 - 7s - loss: 2.7223 - acc: 0.4590 - f1_m: 0.2690 - precision_m: 0.5284 - recall_m: 0.1970 - val_loss: 2.4952 - val_acc: 0.4521 - val_f1_m: 0.3867 - val_precision_m: 0.6358 - val_recall_m: 0.2788
Epoch 2/10
54/54 - 3s - loss: 2.3807 - acc: 0.5084 - f1_m: 0.4239 - precision_m: 0.6751 - recall_m: 0.3139 - val_loss: 2.3227 - val_acc: 0.5100 - val_f1_m: 0.3877 - val_precision_m: 0.6777 - val_recall_m: 0.2723
Epoch 3/10
54/54 - 4s - loss: 2.2632 - acc: 0.5204 - f1_m: 0.4323 - precision_m: 0.6917 - recall_m: 0.3224 - val_loss: 2.2197 - val_acc: 0.5137 - val_f1_m: 0.4776 - val_precision_m: 0.6217 - val_recall_m: 0.3885
Epoch 4/10
54/54 - 3s - loss: 2.1708 - acc: 0.5282 - f1_m: 0.4736 - precision_m: 0.7066 - recall_m: 0.3598 - val_loss: 2.1599 - val_acc: 0.5189 - val_f1_m: 0.4437 - val_precision_m: 0.7010 - val_recall_m: 0.3254
Epoch 5/10
54/54 - 3s - loss: 2.1226 - acc: 0.5319 - f1_m: 0.4761 - precision_m: 0.7144 - recall_m: 0.3613 - val_loss: 2.1363 - val_acc: 0.5122 - val_f1_m: 0.4246 - val_precision_m: 0.7404 - val_recall_m: 0.2988
Epoch 6/10
54/54 - 3s - loss: 2.0660 - acc: 0.5330 - f1_m: 0.4888 - precision_m: 0.7259 - recall_m: 0.3735 - val_loss: 2.1166 - val_acc: 0.5152 - val_f1_m: 0.4335 - val_precision_m: 0.7047 - val_recall_m: 0.3138
Epoch 7/10
54/54 - 3s - loss: 2.0422 - acc: 0.5369 - f1_m: 0.4850 - precision_m: 0.7384 - recall_m: 0.3695 - val_loss: 2.0913 - val_acc: 0.5219 - val_f1_m: 0.4242 - val_precision_m: 0.7363 - val_recall_m: 0.2988
Epoch 8/10
54/54 - 3s - loss: 2.0005 - acc: 0.5446 - f1_m: 0.4843 - precision_m: 0.7616 - recall_m: 0.3638 - val_loss: 2.1012 - val_acc: 0.5226 - val_f1_m: 0.5003 - val_precision_m: 0.6392 - val_recall_m: 0.4117
Epoch 9/10
54/54 - 3s - loss: 1.9551 - acc: 0.5518 - f1_m: 0.5101 - precision_m: 0.7561 - recall_m: 0.3913 - val_loss: 2.0085 - val_acc: 0.5323 - val_f1_m: 0.4645 - val_precision_m: 0.7487 - val_recall_m: 0.3381
Epoch 10/10
54/54 - 3s - loss: 1.9026 - acc: 0.5590 - f1_m: 0.5205 - precision_m: 0.7737 - recall_m: 0.3986 - val_loss: 1.9714 - val_acc: 0.5278 - val_f1_m: 0.4462 - val_precision_m: 0.8220 - val_recall_m: 0.3074
In [ ]:
EvaluateModel(model, modelname, x_test, y_test, train_acc, additional_mectrics=True)
In [ ]:
del ft,model, embedding_matrix, X
In [ ]:
save_model_result()

Glove

In [ ]:
embeddings_index = loadEmbeddingDictionary('original_glove')
Loaded 400000 word vectors.
In [ ]:
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Glove'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words,numWords,300,embeddings_index)
In [ ]:
embedding_matrix.shape
Out[ ]:
(8418, 300)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, labels)
Number of Samples: 8417
Number of Labels:  8417
Number of train Samples: 6733
In [ ]:
model = get_LSTM_Model(embedding_matrix.shape[0],300, 300, embedding_matrix,additional_mectrics=True)
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
Epoch 1/10
54/54 - 7s - loss: 2.6777 - acc: 0.4944 - f1_m: 0.3588 - precision_m: 0.5355 - recall_m: 0.2775 - val_loss: 2.1403 - val_acc: 0.5197 - val_f1_m: 0.4926 - val_precision_m: 0.7483 - val_recall_m: 0.3678
Epoch 2/10
54/54 - 3s - loss: 1.9754 - acc: 0.5498 - f1_m: 0.5320 - precision_m: 0.7616 - recall_m: 0.4127 - val_loss: 1.9285 - val_acc: 0.5449 - val_f1_m: 0.5131 - val_precision_m: 0.7381 - val_recall_m: 0.3939
Epoch 3/10
54/54 - 3s - loss: 1.7739 - acc: 0.5823 - f1_m: 0.5625 - precision_m: 0.7798 - recall_m: 0.4415 - val_loss: 1.7916 - val_acc: 0.5494 - val_f1_m: 0.5308 - val_precision_m: 0.7437 - val_recall_m: 0.4132
Epoch 4/10
54/54 - 3s - loss: 1.6260 - acc: 0.5999 - f1_m: 0.5926 - precision_m: 0.7999 - recall_m: 0.4725 - val_loss: 1.6860 - val_acc: 0.5828 - val_f1_m: 0.5326 - val_precision_m: 0.8075 - val_recall_m: 0.3981
Epoch 5/10
54/54 - 3s - loss: 1.5053 - acc: 0.6194 - f1_m: 0.6103 - precision_m: 0.8202 - recall_m: 0.4878 - val_loss: 1.6380 - val_acc: 0.5768 - val_f1_m: 0.5474 - val_precision_m: 0.8261 - val_recall_m: 0.4106
Epoch 6/10
54/54 - 3s - loss: 1.3736 - acc: 0.6394 - f1_m: 0.6434 - precision_m: 0.8364 - recall_m: 0.5254 - val_loss: 1.5573 - val_acc: 0.5917 - val_f1_m: 0.5848 - val_precision_m: 0.7638 - val_recall_m: 0.4747
Epoch 7/10
54/54 - 3s - loss: 1.2839 - acc: 0.6621 - f1_m: 0.6528 - precision_m: 0.8343 - recall_m: 0.5374 - val_loss: 1.4942 - val_acc: 0.5895 - val_f1_m: 0.5947 - val_precision_m: 0.8174 - val_recall_m: 0.4683
Epoch 8/10
54/54 - 3s - loss: 1.1797 - acc: 0.6836 - f1_m: 0.6691 - precision_m: 0.8508 - recall_m: 0.5527 - val_loss: 1.4433 - val_acc: 0.6080 - val_f1_m: 0.6176 - val_precision_m: 0.7752 - val_recall_m: 0.5135
Epoch 9/10
54/54 - 3s - loss: 1.0802 - acc: 0.7087 - f1_m: 0.7030 - precision_m: 0.8568 - recall_m: 0.5973 - val_loss: 1.4668 - val_acc: 0.6214 - val_f1_m: 0.6203 - val_precision_m: 0.7704 - val_recall_m: 0.5194
Epoch 10/10
54/54 - 3s - loss: 1.0049 - acc: 0.7273 - f1_m: 0.7170 - precision_m: 0.8637 - recall_m: 0.6140 - val_loss: 1.5270 - val_acc: 0.6184 - val_f1_m: 0.6191 - val_precision_m: 0.7296 - val_recall_m: 0.5380
In [ ]:
EvaluateModel(model, modelname, x_test, y_test,train_acc,additional_mectrics=True)
In [ ]:
del model, embeddings_index, embedding_matrix, X
In [ ]:
resultsDf
Out[ ]:
Model Train_Acc Test_Acc Precision Recall F1_Score
0 LSTM with Glove 0.922733 0.969869 0.972889 0.966922 0.969849

Glove with Over sampling

In [ ]:
embeddings_index = loadEmbeddingDictionary('original_glove')
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Glove with OverSampling'

X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words_up_sample, numWords,300,embeddings_index, oversampling=True)

x_train, x_test, y_train, y_test = splitData(X, labels)

model = get_LSTM_Model(embedding_matrix.shape[0],300, 300, embedding_matrix,additional_mectrics=True)
Loaded 400000 word vectors.
Number of Samples: 147525
Number of Labels:  147525
Number of train Samples: 118020
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
EvaluateModel(model, modelname, x_test, y_test,train_acc,additional_mectrics=True)
In [ ]:
y_train.shape
Out[ ]:
(118020, 74)
In [ ]:
del model, embeddings_index, embedding_matrix, X

Glove with SMOTE

Since we are having 73 classes and 8417 samples in the training set. so on average each sample should have atleast 115 samples per class. However, our dataset is extremely imbalanced. There are classes with only one samples and the number of neighbors is 6. So, we cannot use only Synthetic Minority Oversampling Technique (SMOTE). so we will use Random oversampling and then apply SMOTE

In [ ]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
In [ ]:
embeddings_index = loadEmbeddingDictionary('original_glove')
maxlen = 300
numWords=8417
epochs = 10
modelname = 'LSTM with Glove with SMOTE'
Loaded 400000 word vectors.
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words, numWords,300,embeddings_index, oversampling=False)

ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X, labels)

smote = SMOTE(random_state=777,k_neighbors=5)
X, y = smote.fit_sample(X_ROS, y_ROS)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, y)
model = get_LSTM_Model(embedding_matrix.shape[0],300, 300, embedding_matrix,additional_mectrics=True)
Number of Samples: 291116
Number of Labels:  291116
Number of train Samples: 232892
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
EvaluateModel(model, modelname, x_test, y_test,train_acc,additional_mectrics=True)
Epoch 1/10
1864/1864 - 125s - loss: 0.7502 - acc: 0.8016 - f1_m: 0.7887 - precision_m: 0.9251 - recall_m: 0.7143 - val_loss: 0.2088 - val_acc: 0.9370 - val_f1_m: 0.9384 - val_precision_m: 0.9671 - val_recall_m: 0.9117
Epoch 2/10
1864/1864 - 115s - loss: 0.1948 - acc: 0.9407 - f1_m: 0.9418 - precision_m: 0.9618 - recall_m: 0.9230 - val_loss: 0.1213 - val_acc: 0.9638 - val_f1_m: 0.9635 - val_precision_m: 0.9696 - val_recall_m: 0.9576
Epoch 3/10
1864/1864 - 116s - loss: 0.1278 - acc: 0.9594 - f1_m: 0.9601 - precision_m: 0.9713 - recall_m: 0.9492 - val_loss: 0.1017 - val_acc: 0.9677 - val_f1_m: 0.9681 - val_precision_m: 0.9738 - val_recall_m: 0.9626
Epoch 4/10
1864/1864 - 115s - loss: 0.1016 - acc: 0.9670 - f1_m: 0.9675 - precision_m: 0.9761 - recall_m: 0.9591 - val_loss: 0.0737 - val_acc: 0.9752 - val_f1_m: 0.9754 - val_precision_m: 0.9780 - val_recall_m: 0.9728
Epoch 5/10
1864/1864 - 115s - loss: 0.0829 - acc: 0.9721 - f1_m: 0.9725 - precision_m: 0.9795 - recall_m: 0.9657 - val_loss: 0.0621 - val_acc: 0.9778 - val_f1_m: 0.9785 - val_precision_m: 0.9885 - val_recall_m: 0.9688
Epoch 6/10
1864/1864 - 115s - loss: 0.0731 - acc: 0.9751 - f1_m: 0.9757 - precision_m: 0.9817 - recall_m: 0.9698 - val_loss: 0.0603 - val_acc: 0.9788 - val_f1_m: 0.9800 - val_precision_m: 0.9895 - val_recall_m: 0.9708
Epoch 7/10
1864/1864 - 115s - loss: 0.0711 - acc: 0.9757 - f1_m: 0.9762 - precision_m: 0.9823 - recall_m: 0.9703 - val_loss: 0.0530 - val_acc: 0.9818 - val_f1_m: 0.9818 - val_precision_m: 0.9824 - val_recall_m: 0.9813
Epoch 8/10
1864/1864 - 115s - loss: 0.0578 - acc: 0.9794 - f1_m: 0.9797 - precision_m: 0.9848 - recall_m: 0.9747 - val_loss: 0.0492 - val_acc: 0.9823 - val_f1_m: 0.9824 - val_precision_m: 0.9831 - val_recall_m: 0.9818
Epoch 9/10
1864/1864 - 115s - loss: 0.0568 - acc: 0.9797 - f1_m: 0.9800 - precision_m: 0.9852 - recall_m: 0.9750 - val_loss: 0.0486 - val_acc: 0.9830 - val_f1_m: 0.9830 - val_precision_m: 0.9836 - val_recall_m: 0.9825
Epoch 10/10
1864/1864 - 115s - loss: 0.0551 - acc: 0.9804 - f1_m: 0.9808 - precision_m: 0.9864 - recall_m: 0.9753 - val_loss: 0.0498 - val_acc: 0.9823 - val_f1_m: 0.9820 - val_precision_m: 0.9831 - val_recall_m: 0.9809

2.RNN Model

In [ ]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
In [ ]:
embeddings_index = loadEmbeddingDictionary('original_glove')
maxlen = 300
numWords=8417
epochs = 10
modelname = 'RNN with Glove with SMOTE'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words, numWords,300,embeddings_index, oversampling=False)
In [ ]:
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X, labels)

smote = SMOTE(random_state=777,k_neighbors=5)
X, y = smote.fit_sample(X_ROS, y_ROS)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, y)
model = get_RNN_Model(embedding_matrix.shape[0],300, 300, embedding_matrix,additional_mectrics=True)
Number of Samples: 291116
Number of Labels:  291116
Number of train Samples: 232892
Model: "RNNSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
Conv1D-1 (Conv1D)            (None, 291, 100)          300100    
_________________________________________________________________
MaxPooling1D-1 (MaxPooling1D (None, 145, 100)          0         
_________________________________________________________________
Dropout-1 (Dropout)          (None, 145, 100)          0         
_________________________________________________________________
Conv1D-2 (Conv1D)            (None, 136, 100)          100100    
_________________________________________________________________
MaxPooling1D (MaxPooling1D)  (None, 68, 100)           0         
_________________________________________________________________
Bidirectional_LSTM (Bidirect (None, 256)               234496    
_________________________________________________________________
Dropout-2 (Dropout)          (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 3,193,270
Trainable params: 667,870
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
EvaluateModel(model, modelname, x_test, y_test,train_acc,additional_mectrics=True)
Epoch 1/10
1864/1864 - 69s - loss: 0.1782 - acc: 0.9420 - f1_m: 0.9455 - precision_m: 0.9670 - recall_m: 0.9251 - val_loss: 0.1142 - val_acc: 0.9603 - val_f1_m: 0.9650 - val_precision_m: 0.9813 - val_recall_m: 0.9494
Epoch 2/10
1864/1864 - 68s - loss: 0.1373 - acc: 0.9532 - f1_m: 0.9567 - precision_m: 0.9741 - recall_m: 0.9401 - val_loss: 0.0959 - val_acc: 0.9664 - val_f1_m: 0.9699 - val_precision_m: 0.9848 - val_recall_m: 0.9556
Epoch 3/10
1864/1864 - 67s - loss: 0.1215 - acc: 0.9585 - f1_m: 0.9614 - precision_m: 0.9770 - recall_m: 0.9464 - val_loss: 0.0955 - val_acc: 0.9668 - val_f1_m: 0.9701 - val_precision_m: 0.9866 - val_recall_m: 0.9543
Epoch 4/10
1864/1864 - 67s - loss: 0.1106 - acc: 0.9610 - f1_m: 0.9645 - precision_m: 0.9800 - recall_m: 0.9496 - val_loss: 0.0920 - val_acc: 0.9691 - val_f1_m: 0.9726 - val_precision_m: 0.9886 - val_recall_m: 0.9573
Epoch 5/10
1864/1864 - 67s - loss: 0.1036 - acc: 0.9632 - f1_m: 0.9667 - precision_m: 0.9817 - recall_m: 0.9523 - val_loss: 0.0840 - val_acc: 0.9708 - val_f1_m: 0.9740 - val_precision_m: 0.9880 - val_recall_m: 0.9606
Epoch 6/10
1864/1864 - 67s - loss: 0.0988 - acc: 0.9650 - f1_m: 0.9684 - precision_m: 0.9832 - recall_m: 0.9542 - val_loss: 0.0746 - val_acc: 0.9728 - val_f1_m: 0.9758 - val_precision_m: 0.9897 - val_recall_m: 0.9624
Epoch 7/10
1864/1864 - 67s - loss: 0.0950 - acc: 0.9659 - f1_m: 0.9693 - precision_m: 0.9842 - recall_m: 0.9551 - val_loss: 0.0738 - val_acc: 0.9738 - val_f1_m: 0.9770 - val_precision_m: 0.9909 - val_recall_m: 0.9636
Epoch 8/10
1864/1864 - 67s - loss: 0.0889 - acc: 0.9677 - f1_m: 0.9714 - precision_m: 0.9861 - recall_m: 0.9573 - val_loss: 0.0765 - val_acc: 0.9718 - val_f1_m: 0.9751 - val_precision_m: 0.9884 - val_recall_m: 0.9623
Epoch 9/10
1864/1864 - 67s - loss: 0.0884 - acc: 0.9681 - f1_m: 0.9714 - precision_m: 0.9857 - recall_m: 0.9575 - val_loss: 0.0723 - val_acc: 0.9739 - val_f1_m: 0.9770 - val_precision_m: 0.9903 - val_recall_m: 0.9641
Epoch 10/10
1864/1864 - 67s - loss: 0.0859 - acc: 0.9691 - f1_m: 0.9724 - precision_m: 0.9868 - recall_m: 0.9586 - val_loss: 0.0707 - val_acc: 0.9739 - val_f1_m: 0.9775 - val_precision_m: 0.9910 - val_recall_m: 0.9644
In [ ]:
del model, X

3.GRU Model

In [ ]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
In [ ]:
embeddings_index = loadEmbeddingDictionary('original_glove')
maxlen = 300
numWords=8417
epochs = 10
modelname = 'GRU with Glove with SMOTE'
In [ ]:
X, labels, embedding_matrix = getVariables_and_EmbeddingMatrix(doc_words, numWords,300,embeddings_index, oversampling=False)
In [ ]:
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_sample(X, labels)
In [ ]:
smote = SMOTE(random_state=777,k_neighbors=5)
X, y = smote.fit_sample(X_ROS, y_ROS)
In [ ]:
x_train, x_test, y_train, y_test = splitData(X, y)
model = get_LSTM_Model(embedding_matrix.shape[0],300, 300, embedding_matrix,additional_mectrics=True)
Number of Samples: 291116
Number of Labels:  291116
Number of train Samples: 232892
Model: "LSTMSequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
Embedding (Embedding)        (None, 300, 300)          2525400   
_________________________________________________________________
BidirectionalLSTM (Bidirecti (None, 256)               439296    
_________________________________________________________________
Dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
Dense (Dense)                (None, 100)               25700     
_________________________________________________________________
DenseOutput (Dense)          (None, 74)                7474      
=================================================================
Total params: 2,997,870
Trainable params: 472,470
Non-trainable params: 2,525,400
_________________________________________________________________
In [ ]:
train_acc = TrainModel(model, modelname,x_train,y_train, epochs=10,batch_size=100, verbose=2)
EvaluateModel(model, modelname, x_test, y_test,train_acc,additional_mectrics=True)
Epoch 1/10
1864/1864 - 125s - loss: 0.7420 - acc: 0.8029 - f1_m: 0.7918 - precision_m: 0.9255 - recall_m: 0.7176 - val_loss: 0.2249 - val_acc: 0.9326 - val_f1_m: 0.9332 - val_precision_m: 0.9651 - val_recall_m: 0.9035
Epoch 2/10
1864/1864 - 121s - loss: 0.1894 - acc: 0.9420 - f1_m: 0.9430 - precision_m: 0.9631 - recall_m: 0.9239 - val_loss: 0.1197 - val_acc: 0.9618 - val_f1_m: 0.9633 - val_precision_m: 0.9797 - val_recall_m: 0.9477
Epoch 3/10
1864/1864 - 121s - loss: 0.1281 - acc: 0.9590 - f1_m: 0.9598 - precision_m: 0.9706 - recall_m: 0.9493 - val_loss: 0.0850 - val_acc: 0.9718 - val_f1_m: 0.9727 - val_precision_m: 0.9845 - val_recall_m: 0.9613
Epoch 4/10
1864/1864 - 121s - loss: 0.0982 - acc: 0.9674 - f1_m: 0.9679 - precision_m: 0.9759 - recall_m: 0.9600 - val_loss: 0.0691 - val_acc: 0.9742 - val_f1_m: 0.9756 - val_precision_m: 0.9868 - val_recall_m: 0.9647
Epoch 5/10
1864/1864 - 120s - loss: 0.0880 - acc: 0.9707 - f1_m: 0.9714 - precision_m: 0.9785 - recall_m: 0.9643 - val_loss: 0.0673 - val_acc: 0.9765 - val_f1_m: 0.9775 - val_precision_m: 0.9875 - val_recall_m: 0.9678
Epoch 6/10
1864/1864 - 121s - loss: 0.0743 - acc: 0.9744 - f1_m: 0.9749 - precision_m: 0.9811 - recall_m: 0.9688 - val_loss: 0.0551 - val_acc: 0.9802 - val_f1_m: 0.9803 - val_precision_m: 0.9809 - val_recall_m: 0.9796
Epoch 7/10
1864/1864 - 120s - loss: 0.0658 - acc: 0.9767 - f1_m: 0.9773 - precision_m: 0.9834 - recall_m: 0.9714 - val_loss: 0.0656 - val_acc: 0.9778 - val_f1_m: 0.9783 - val_precision_m: 0.9797 - val_recall_m: 0.9769
Epoch 8/10
1864/1864 - 120s - loss: 0.0619 - acc: 0.9780 - f1_m: 0.9785 - precision_m: 0.9841 - recall_m: 0.9730 - val_loss: 0.0528 - val_acc: 0.9795 - val_f1_m: 0.9801 - val_precision_m: 0.9897 - val_recall_m: 0.9707
Epoch 9/10
1864/1864 - 120s - loss: 0.0591 - acc: 0.9788 - f1_m: 0.9792 - precision_m: 0.9848 - recall_m: 0.9736 - val_loss: 0.0563 - val_acc: 0.9796 - val_f1_m: 0.9810 - val_precision_m: 0.9906 - val_recall_m: 0.9717
Epoch 10/10
1864/1864 - 120s - loss: 0.0524 - acc: 0.9806 - f1_m: 0.9810 - precision_m: 0.9869 - recall_m: 0.9751 - val_loss: 0.0469 - val_acc: 0.9830 - val_f1_m: 0.9830 - val_precision_m: 0.9833 - val_recall_m: 0.9827
In [ ]:
save_model_result(overwrite=True)

**The Pretrained Models for Text Classification**

With transfer learning we are now able to use a pre-existing model built on a huge dataset and tune it to achieve other tasks on a different dataset.

Transfer learning, and pretrained models, have 2 major advantages:

  • It has reduced the cost of training a new deep learning model every time
  • These datasets meet industry-accepted standards, and thus the pretrained models have already been vetted on the quality aspect

**1. XLNet**

Google’s latest model, XLNet achieved State-of-the-Art (SOTA) performance on the major NLP tasks such as Text Classification, Sentiment Analysis, Question Answering, and Natural Language Inference along with the essential GLUE benchmark for English.

Lets try to implement XLNet for our classification problem.The core ideas behind XLNet are:

  • Generalized Autoregressive Pretraining for Language Understanding
  • The Transformer-XL

Autoregressive modeling is used to predict the next word using the context words occurring either before or after the missing word in question. However, we can’t process both the forward and backward directions at the same time.

XLNet proposes a technique called Permutation Language Modeling during the pre-training phase. This technique uses permutations to generate information from both the forward and backward directions simultaneously.

Transformer architecture has been a game-changer. XLNet uses Transformer XL. And transformers are an alternative to recurrent neural networks (RNN) in the sense that they allowed non-adjacent tokens to be processed together as well. This improved understanding of long-distance relations in text. Transformer-XL is basically an enhanced version by adding two components:

  • A recurrence at specific segments which gives the context between 2 sequences
  • A relative positional embedding which contains information on the similarity between 2 tokens

Here, we will use the excellent library transformers which is deployed by Huggingface, this library contains some state-of-the-art pre-trained models for Natural Language Processing

The process of doing text classification with XLNet contains 4 steps:

  1. Load data
  2. Set data into training embeddings
  3. Train model
  4. Evaluate model performance
In [ ]:
!pip install transformers -q
% pip install sentencepiece
!pip install torch
Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (0.1.95)
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (1.7.0+cu101)
Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch) (0.8)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch) (0.16.0)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch) (3.7.4.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch) (1.19.5)
In [ ]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
import torch
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F
In [ ]:
# Check library version
!pip list | grep -E 'transformers|torch|Keras'
Keras                         2.4.3          
Keras-Preprocessing           1.1.2          
torch                         1.7.0+cu101    
torchsummary                  1.5.1          
torchtext                     0.3.1          
torchvision                   0.8.1+cu101    
transformers                  4.2.2          

**1.Load Data**

In [ ]:
text = TextPreprocessor(n_jobs=-1).transform(incidentsData_Others_upsample['New_Description'])
In [ ]:
# Get sentence data
sentences = text.to_list()
sentences[0]
Out[ ]:
'login issue verify user detail employee manager check user ad reset password advise user login check caller confirm able login issue resolve'
In [ ]:
labels = incidentsData_Others_upsample['Assignment group'].to_list()
In [ ]:
print(labels[0])
GRP_0
In [ ]:
names = incidentsData_Others_upsample['Assignment group'].unique().tolist()
grpID=[]
for grp in names:
  grpID.append(int(grp.replace("GRP_","")))

grp_mapping = dict(zip(grpID,names))

tag2idx = grp_mapping
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}
Make training data

Make raw data into trainable data for XLNet, including:

  • Set gpu environment
  • Load tokenizer and tokenize
  • Set 3 embedding, token embedding, mask word embedding, segmentation embedding
  • Split data set into train and validate, then send them to dataloader

Set up gpu environment

In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu
Out[ ]:
1

Load tokenizer
For this we will need to install sentencepiece. (if in case of issue with next line we need to restart kernel and rerun application.)

please download vocabulary from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model"

In [ ]:
vocabulary =  project_path + 'Embeddings/xlnet/xlnet-base-cased-spiece.model'
In [ ]:
max_len  = 100
In [ ]:
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

**2.Set data into training embedding**

After we get the data, we need to set the text into 3 kinds of embeddings:

  • Token embedding
  • Mask word embedding
  • Segmentation embedding
In [ ]:
max_len  = 100

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")
No.:0
sentence: login issue verify user detail employee manager check user ad reset password advise user login check caller confirm able login issue resolve
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11178, 671, 9948, 1930, 4089, 3820, 1416, 1180, 1930, 24, 66, 16263, 5886, 8488, 1930, 11178, 1180, 18380, 4775, 551, 11178, 671, 4885, 4, 3, 7739, 7739]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1
sentence: outlook receive hmjdrvpb komuaywn gmail com team meeting skype meeting etc appear outlook calendar somebody advise correct kind
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7905, 1217, 17, 12293, 1315, 7841, 1232, 450, 508, 17, 20964, 6447, 117, 10020, 17, 299, 1635, 2748, 230, 492, 4362, 1590, 492, 1813, 1734, 7905, 5892, 6708, 8488, 2900, 713, 4, 3, 7739, 7739]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:2
sentence: not log vpn receive eylqgodm ybqkwiam gmail com log vpn
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 6015, 2721, 450, 180, 1217, 17, 93, 3711, 2474, 19863, 98, 17, 117, 508, 2474, 267, 3164, 1199, 17, 299, 1635, 2748, 6015, 2721, 450, 180, 4, 3, 7739, 7739]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


Set label embedding

In [ ]:
# Make label into id
tags = [tag2name[str(lab)] for lab in labels]
print(tags[0])
0

Split data into train and validate

70% for training, 30% for validation

In [ ]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(full_input_ids, tags,full_input_masks,full_segment_ids, 
                                                            random_state=4, test_size=0.3)
In [ ]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)
Out[ ]:
(103267, 44258, 103267, 44258)

Set data into tensor

In [ ]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)

tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)

tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

Put data into data loader

In [ ]:
# Set batch num
batch_num = 64
In [ ]:
# Set token embedding, attention embedding, segment embedding
train_data = TensorDataset(tr_inputs, tr_masks,tr_segs, tr_tags)
train_sampler = RandomSampler(train_data)

# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks,val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

**3.Train model**

When using transform learning like XLNet, the process of training a new model with downstream data called “fine-tuning”, for this we need to choose one of the XLNet pre-trained model and use our own data to update the model’s parameter to fix our downstream NLP task.

XLNet have 2 kinds of model, base cased model and large cased model. Here, we will choose XLNet base cased model for fine-tuning new model. The large version model is a bigger model than base model, which has better performance, but need more computing power and times.

In [ ]:
# Load XLNet model
# In this document, contain confg(txt) and weight(bin) files
model_file_address = 'xlnet-base-cased' 

# Download model from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin"
# Download model from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json"
In [ ]:
model = XLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [ ]:
model;
In [ ]:
# Set model to GPU,if you are using GPU machine
model.to(device);
In [ ]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)
In [ ]:
# Set epoch and grad max num
epochs = 5
max_grad_norm = 1.0
In [ ]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

Set fine tuning method

In [ ]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
# Since XLNet in 'pytorch_transformer' did not contian classifier layers
# FULL_FINETUNING = True need to set True
FULL_FINETUNING = True
In [ ]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
In [ ]:
# TRAIN loop
model.train();
In [ ]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_segs,b_labels = batch
        
        # forward pass
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        loss, logits = outputs[:2]
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
***** Running training *****
  Num examples = 103267
  Batch size = 64
  Num steps = 8070
Epoch:  20%|██        | 1/5 [38:31<2:34:04, 2311.15s/it]
Train loss: 0.7788618238018081
Epoch:  40%|████      | 2/5 [1:17:03<1:55:34, 2311.63s/it]
Train loss: 0.0903384218618022
Epoch:  60%|██████    | 3/5 [1:55:36<1:17:03, 2311.83s/it]
Train loss: 0.056720816845611845
Epoch:  80%|████████  | 4/5 [2:34:08<38:32, 2312.07s/it]  
Train loss: 0.04476985447167928
In [ ]:
print(tr_loss)
170.39712598560436
In [ ]:
print(nb_tr_steps)
1613
In [ ]:
tr_loss = 72.2137752628185
In [ ]:
tr_loss
Out[ ]:
72.2137752628185
Save model
In [ ]:
xlnet_out_address = project_path + 'Embeddings/xlnet/models/xlnet_out_model/tc04'
In [ ]:
# Make dir if not exits
if not os.path.exists(xlnet_out_address):
        os.makedirs(xlnet_out_address)
In [ ]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
In [ ]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(xlnet_out_address, "pytorch_model.bin")
output_config_file = os.path.join(xlnet_out_address, "config.json")
In [ ]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(xlnet_out_address)
Out[ ]:
('/content/drive/My Drive/Colab Notebooks/Capstone/Embeddings/xlnet/models/xlnet_out_model/tc04/spiece.model',)

**4.Evaluate model performance**

Load model
In [ ]:
model = XLNetForSequenceClassification.from_pretrained(xlnet_out_address,num_labels=len(tag2idx))
In [ ]:
# Set model to GPU
model.to(device);
In [ ]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)
Eval model
In [ ]:
# Evalue loop
model.eval();
In [ ]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)
In [ ]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps 
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(xlnet_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    print(report)
    writer.write("\n\n")  
    writer.write(report)
***** Running evaluation *****
  Num examples =44258
  Batch size = 64
***** Eval results *****
  eval_accuracy = 0.9857652853721361
  eval_loss = 0.05169076022105406
  loss = 0.04476985447167917
              precision    recall  f1-score   support

           0       1.00      0.82      0.90      1151
           1       0.99      1.00      1.00       605
           2       0.96      0.97      0.97       595
           3       0.96      0.99      0.98       582
           4       0.98      1.00      0.99       573
           5       0.97      1.00      0.99       591
           6       0.98      1.00      0.99       590
           7       0.99      1.00      0.99       604
           8       0.99      0.92      0.95       575
           9       0.99      0.99      0.99       586
          10       0.99      1.00      1.00       574
          11       1.00      1.00      1.00       585
          12       0.99      0.96      0.98       567
          13       0.98      1.00      0.99       572
          14       0.98      1.00      0.99       608
          15       1.00      1.00      1.00       583
          16       0.97      1.00      0.99       582
          17       1.00      1.00      1.00       608
          18       1.00      1.00      1.00       636
          19       0.98      0.99      0.98       569
          20       0.99      1.00      1.00       597
          21       0.99      1.00      1.00       598
          22       1.00      1.00      1.00       593
          23       1.00      1.00      1.00       632
          24       0.99      1.00      1.00       590
          25       0.99      0.99      0.99       589
          26       0.99      1.00      0.99       590
          27       0.99      1.00      0.99       589
          28       0.99      1.00      1.00       577
          29       1.00      1.00      1.00       609
          30       0.96      0.57      0.72       621
          31       1.00      0.91      0.96       593
          32       1.00      1.00      1.00       587
          33       0.99      1.00      0.99       570
          34       0.99      0.99      0.99       610
          35       1.00      1.00      1.00       579
          36       0.99      1.00      0.99       566
          37       1.00      1.00      1.00       577
          38       1.00      1.00      1.00       597
          39       0.99      1.00      0.99       531
          40       1.00      1.00      1.00       629
          41       1.00      1.00      1.00       630
          42       0.99      1.00      1.00       569
          43       1.00      1.00      1.00       594
          44       0.99      1.00      1.00       611
          45       0.99      1.00      1.00       571
          46       1.00      1.00      1.00       595
          47       1.00      1.00      1.00       569
          48       0.67      1.00      0.80       599
          49       1.00      1.00      1.00       594
          50       0.98      1.00      0.99       591
          51       1.00      1.00      1.00       587
          52       1.00      1.00      1.00       588
          53       1.00      1.00      1.00       613
          54       1.00      1.00      1.00       638
          55       1.00      1.00      1.00       590
          56       1.00      1.00      1.00       603
          57       1.00      1.00      1.00       581
          58       1.00      1.00      1.00       577
          59       1.00      1.00      1.00       625
          60       0.99      1.00      0.99       589
          61       1.00      1.00      1.00       562
          62       0.99      1.00      1.00       592
          63       1.00      1.00      1.00       568
          64       1.00      1.00      1.00       589
          65       1.00      1.00      1.00       624
          66       1.00      1.00      1.00       573
          67       1.00      1.00      1.00       548
          68       1.00      1.00      1.00       586
          69       1.00      1.00      1.00       581
          70       1.00      1.00      1.00       568
          71       1.00      1.00      1.00       602
          72       0.94      1.00      0.97       590
          73       1.00      1.00      1.00       601

    accuracy                           0.99     44258
   macro avg       0.99      0.99      0.99     44258
weighted avg       0.99      0.99      0.99     44258

In [ ]:
eval_accuracy 
Out[ ]:
0.9857652853721361
In [ ]:
# From above report
update_model_score('XLNet with Over-Sampling','-',eval_accuracy,0.99,0.99,0.98)
Out[ ]:
New_ID Model Train_Acc Test_Acc Precision Recall F1_Score
0 1.0 ExtraTrees - Original Data 0.992870934 0.668646 0.658952685 0.711757269 0.684337867
1 2.0 SVM - Original Data 0.860686173 0.691211 0.67933036 0.741873805 0.709225897
2 3.0 Multinomial Naïve Bayes - Original Data 0.557700876 0.559976 0.594468028 0.85494107 0.701299768
3 4.0 Naïve Bayes - Original Data 0.934798752 0.571259 0.596109492 0.577777778 0.586800499
4 5.0 SGD Classifier - Original Data 0.703698203 0.631235 0.639680854 0.693411611 0.665463414
5 6.0 Decision Tree - Original Data 0.992870934 0.589667 0.595145411 0.595323741 0.595234563
6 7.0 Random Forest - Original Data 0.990197535 0.605107 0.653851539 0.634100809 0.643824735
7 8.0 AdaBoost - Original Data 0.513886826 0.523753 0.52093719 0.950431034 0.672999272
8 9.0 Bagging - Original Data 0.99257389 0.665677 0.658028094 0.684371184 0.670941162
9 10.0 Gradient Boosting - Original Data 0.982919947 0.630641 0.613589302 0.637072585 0.625110475
10 11.0 Bagging with Over-Sampling 0.990603288 0.982647 0.98414043 0.982647009 0.983393152
11 12.0 ExtraTrees with Over-Sampling 0.990899847 0.989019 0.990434274 0.98901881 0.989726036
12 13.0 LSTM with Word2Vec(Simple Averaging) 0.4640921 0.482779 - - -
13 14.0 LSTM with Word2Vec(TF-IDF Weighted Averaging) 0.464649099 0.482779 - - -
14 15.0 LSTM with Doc2Vec 0.466394365 0.482779 - - -
15 16.0 LSTM with FastText 0.527330112 0.564133 0.839545369 0.354599059 0.495131284
16 17.0 LSTM with Glove 0.626680285 0.634204 0.756910622 0.566981137 0.64677012
17 18.0 LSTM with Glove using SMOTE 0.950081855 0.983426 0.993631899 0.975291908 0.984238148
18 19.0 RNN with Glove with SMOTE 0.961369842 0.973035 0.990182221 0.963907957 0.976669252
19 20.0 GRU with Glove with SMOTE 0.953051579 0.982980 0.98321569 0.982692301 0.982949674
20 21.0 XLNet with Over-Sampling - 0.985268 0.99 0.99 0.98
21 22.0 BERT with Original Data - 0.690000 - - -
22 23.0 BERT with Over-Sampling for training - 0.900000 - - -
0 NaN XLNet with Over-Sampling - 0.900000 0.99 0.99 0.98
0 NaN XLNet with Over-Sampling - 0.985765 0.99 0.99 0.98
In [ ]:
resultsDf
Out[ ]:
Model Train_Acc Test_Acc Precision Recall F1_Score
0 XLNet with Over-Sampling - 0.985268 0.99 0.99 0.98

**2. BERT**

**BERT Common Functions**

In [32]:
!pip install transformers -q
!pip install torch
     |████████████████████████████████| 1.8MB 16.8MB/s 
     |████████████████████████████████| 2.9MB 50.7MB/s 
     |████████████████████████████████| 890kB 47.7MB/s 
  Building wheel for sacremoses (setup.py) ... done
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (1.7.0+cu101)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.6/dist-packages (from torch) (3.7.4.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch) (1.19.5)
Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch) (0.16.0)
Requirement already satisfied: dataclasses in /usr/local/lib/python3.6/dist-packages (from torch) (0.8)
In [33]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
In [34]:
from sklearn.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
def accuracy(preds, labels):
    outputs = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(outputs == labels_flat)

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
In [35]:
def get_transormedX_and_labels(df):
  all_labels = df['Assignment group'].unique()

  label_dict = {}
  for index, label in enumerate(all_labels):
      label_dict[label] = index
  label_dict

  df['label'] = df['Assignment group'].replace(label_dict)
  X = TextPreprocessor(n_jobs=-1).transform(df['New_Description'])
  return df, X, label_dict
In [36]:
def evaluate(dataloader_val):

    model.eval()    
    loss_val_total = 0
    predictions, true_vals = [], []
    total_eval_accuracy = 0
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

     # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(dataloader_val)
  
            
    return loss_val_avg, predictions, true_vals, avg_val_accuracy
In [37]:
from transformers import BertModel, BertConfig,BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm,trange
def get_train_val_dataset(df):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                            do_lower_case=True)
                                            
  encoded_data_train = tokenizer.batch_encode_plus(
      df[df.data_type=='train'].New_Description.values, 
      add_special_tokens=True, 
      return_attention_mask=True, 
      padding='longest',
      max_length=256, 
      return_tensors='pt',truncation=True
  )

  encoded_data_val = tokenizer.batch_encode_plus(
      df[df.data_type=='val'].New_Description.values, 
      add_special_tokens=True, 
      return_attention_mask=True, 
      padding='longest',
      max_length=256, 
      return_tensors='pt',truncation=True
  )


  input_ids_train = encoded_data_train['input_ids']
  attention_masks_train = encoded_data_train['attention_mask']
  labels_train = torch.tensor(df[df.data_type=='train'].label.values)

  input_ids_val = encoded_data_val['input_ids']
  attention_masks_val = encoded_data_val['attention_mask']
  labels_val = torch.tensor(df[df.data_type=='val'].label.values)

  dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
  dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
  return dataset_train, dataset_val

**BERT Model**

In [ ]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split


df = incidentsData.copy()
In [ ]:
# Lets find out Groups with ticket count only 1, for such groups we cant split the data set in a stratified fashion
minor_df = df.groupby('Assignment group').filter(lambda x: len(x) <= 1)
In [ ]:
minor_df
Out[ ]:
Short description Description New_Description Caller Assignment group Description_pos_tagged
401 need access to erp kp06789 need access to kp06789 to enter forecast for i... need access to erp kp06789 need access to kp06... etvendormhd xpslzunb GRP_35 [(need, NN), (access, NN), (to, TO), (erp, VB)...
3036 (srvlavpwdrprd01.company.company.com) is not r... (srvlavpwdrprd01.company.company.com) is not r... (srvlavpwdrprd01.company.company.com) is not r... vushymxe ifrbzdtl GRP_61 [((, (), (srvlavpwdrprd01.company.company.com,...
3628 r: ticket_no1402627 change in report zsdslsum ... \r\n\r\nreceived from: xawlkiey.demjqrfl@gmail... r: ticket_no1402627 change in report zsdslsum ... xawlkiey demjqrfl GRP_64 [(r, NN), (:, :), (ticket_no1402627, NN), (cha...
5154 unable to complete forecast unable to complete forecast\n\njochegtyhu is o... unable to complete forecast unable to complete... fnqelwpk ahrskvln GRP_67 [(unable, JJ), (to, TO), (complete, VB), (fore...
6401 a link on an e-mail says i am "forbidden" an e-mail from it training has email hints an... a link on an e-mail says i am "forbidden" an e... mfvkxghn mzjasxqd GRP_70 [(a, DT), (link, NN), (on, IN), (an, DT), (e-m...
8197 oneteam sso not working i'm unable to log in to hr_tool/oneteam throug... oneteam sso not working i'm unable to log in t... kcnosyae zlpmfxgs GRP_73 [(oneteam, NN), (sso, NN), (not, RB), (working...
In [ ]:
df.shape
Out[ ]:
(8417, 6)
In [ ]:
# Treat the imbalnce in the 'other' dataset by resampling
from sklearn.utils import resample

incidentsData_upsampled = minor_df[0:0]

# Upsample minority class
for grp in minor_df['Assignment group'].unique():
    incidentsData_Group = df[df['Assignment group'] == grp]
    resampled = resample(incidentsData_Group, 
                         replace=True, # sample with replacement
                         n_samples=2, 
                         random_state=123) # reproducible results
    
    incidentsData_upsampled = incidentsData_upsampled.append(resampled)

frames = [df,incidentsData_upsampled]
df = pd.concat(frames)
In [ ]:
df, X = get_transormedX_and_labels(df)
In [ ]:
x_train, x_test, y_train, y_test  = train_test_split(X.index.values,
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[x_train, 'data_type'] = 'train'
df.loc[x_test, 'data_type'] = 'val'
In [ ]:
dataset_train, dataset_val = get_train_val_dataset(df)
In [ ]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [ ]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
In [ ]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=2e-5, 
                  eps=1e-8)
                  
epochs = 5
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
In [ ]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
In [ ]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(),    project_path + f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals, avg_val_accuracy = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {avg_val_accuracy}')
    
In [ ]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load(project_path + 'data_volume/finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))

avg_val_loss, predictions, true_vals, avg_val_accuracy = evaluate(dataloader_validation)
print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
print("  Validation Loss: {0:.2f}".format(avg_val_loss))
# accuracy_per_class(predictions, true_vals)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Accuracy: 0.69
  Validation Loss: 1.61
In [ ]:
# From above report
update_model_score('BERT with Original Data','-',0.69,'-', '-','-')
Out[ ]:
New_ID Model Train_Acc Test_Acc Precision Recall F1_Score
0 1.0 ExtraTrees - Original Data 0.9928709342046635 0.668646 0.6589526845175626 0.7117572692793932 0.6843378674202307
1 2.0 SVM - Original Data 0.8606861725828011 0.691211 0.6793303604166993 0.7418738049713193 0.7092258967271996
2 3.0 Naïve Bayes - Original Data 0.934798752413486 0.571259 0.5961094920637404 0.5777777777777777 0.5868004986259479
3 4.0 Multinomial Naïve Bayes - Original Data 0.5577008762810041 0.559976 0.5944680282493915 0.8549410698096102 0.701299767911979
4 5.0 SGD Classifier - Original Data 0.7036982028813308 0.631235 0.6396808540656695 0.6934116112198304 0.6654634141813385
5 6.0 Naïve Bayes - Original Data 0.934798752413486 0.571259 0.5961094920637404 0.5777777777777777 0.5868004986259479
6 7.0 Multinomial Naïve Bayes - Original Data 0.5577008762810041 0.559976 0.5944680282493915 0.8549410698096102 0.701299767911979
7 8.0 SGD Classifier - Original Data 0.7036982028813308 0.631235 0.6396808540656695 0.6934116112198304 0.6654634141813385
8 9.0 Decision Tree - Original Data 0.9928709342046635 0.589667 0.5951454112657192 0.5953237410071942 0.5952345627797491
9 10.0 Random Forest - Original Data 0.9901975345314123 0.605107 0.6538515389202167 0.6341008089607966 0.6438247353664897
10 11.0 AdaBoost - Original Data 0.5138868260804991 0.523753 0.5209371898605262 0.9504310344827587 0.6729992724705817
11 12.0 Bagging - Original Data 0.9925738897965246 0.665677 0.6580280938132603 0.6843711843711844 0.6709411621876901
12 13.0 Gradient Boosting - Original Data 0.9829199465320064 0.630641 0.6135893023755332 0.6370725854829035 0.625110474835671
13 14.0 Bagging with Over-Sampling 0.9906032875783766 0.982647 0.9841404299176464 0.9826470089815286 0.9833931524574336
14 15.0 ExtraTrees with Over-Sampling 0.9908998474834774 0.989019 0.9904342737860568 0.9890188103711235 0.9897260359951939
15 16.0 LSTM with Word2Vec(Simple Averaging) 0.4640920996665955 0.482779 - - -
16 17.0 LSTM with Word2Vec(TF-IDF Weighted Averaging) 0.464649099111557 0.482779 - - -
17 18.0 LSTM with Doc2Vec 0.4663943648338318 0.482779 - - -
18 19.0 LSTM with FastText 0.5273301124572753 0.564133 0.8395453691482544 0.3545990586280823 0.4951312839984894
19 20.0 LSTM with Glove 0.6266802847385406 0.634204 0.7569106221199036 0.5669811367988586 0.6467701196670532
20 21.0 LSTM with Glove using SMOTE 0.95008185505867 0.983426 0.993631899356842 0.975291907787323 0.9842381477355957
21 22.0 XLNet with Over-Sampling - 0.985268 0.99 0.99 0.98
0 NaN BERT with Original Data - 0.690000 - - -

**2. BERT using up sampling for training**

In [ ]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
df = incidentsData_Others_upsample.copy()
In [ ]:
df, X, label_dict  = get_transormedX_and_labels(df)
In [ ]:
x_train, x_test, y_train, y_test  = train_test_split(X.index.values,
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[x_train, 'data_type'] = 'train'
df.loc[x_test, 'data_type'] = 'val'

dataset_train, dataset_val = get_train_val_dataset(df)
In [ ]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
In [ ]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

**Model Training**

In [ ]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(),    project_path + f'data_volume/over_finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals, avg_val_accuracy = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Accuracy: {avg_val_accuracy}')
Streaming output truncated to the last 5000 lines.
Epoch 1:  26%|██▌       | 10678/41242 [34:42<1:39:07,  5.14it/s, training_loss=0.623]
Epoch 1:  26%|██▌       | 10678/41242 [34:42<1:39:07,  5.14it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10679/41242 [34:42<1:38:05,  5.19it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10679/41242 [34:42<1:38:05,  5.19it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10680/41242 [34:42<1:39:15,  5.13it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10680/41242 [34:43<1:39:15,  5.13it/s, training_loss=0.236]
Epoch 1:  26%|██▌       | 10681/41242 [34:43<1:41:05,  5.04it/s, training_loss=0.236]
Epoch 1:  26%|██▌       | 10681/41242 [34:43<1:41:05,  5.04it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10682/41242 [34:43<1:42:23,  4.97it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10682/41242 [34:43<1:42:23,  4.97it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10683/41242 [34:43<1:40:56,  5.05it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10683/41242 [34:43<1:40:56,  5.05it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10684/41242 [34:43<1:40:28,  5.07it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10684/41242 [34:43<1:40:28,  5.07it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10685/41242 [34:43<1:39:09,  5.14it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10685/41242 [34:44<1:39:09,  5.14it/s, training_loss=0.277]
Epoch 1:  26%|██▌       | 10686/41242 [34:44<1:38:30,  5.17it/s, training_loss=0.277]
Epoch 1:  26%|██▌       | 10686/41242 [34:44<1:38:30,  5.17it/s, training_loss=0.021]
Epoch 1:  26%|██▌       | 10687/41242 [34:44<1:41:31,  5.02it/s, training_loss=0.021]
Epoch 1:  26%|██▌       | 10687/41242 [34:44<1:41:31,  5.02it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10688/41242 [34:44<1:41:21,  5.02it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10688/41242 [34:44<1:41:21,  5.02it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10689/41242 [34:44<1:39:57,  5.09it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10689/41242 [34:44<1:39:57,  5.09it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10690/41242 [34:44<1:38:48,  5.15it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10690/41242 [34:45<1:38:48,  5.15it/s, training_loss=0.619]
Epoch 1:  26%|██▌       | 10691/41242 [34:45<1:40:54,  5.05it/s, training_loss=0.619]
Epoch 1:  26%|██▌       | 10691/41242 [34:45<1:40:54,  5.05it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10692/41242 [34:45<1:40:53,  5.05it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10692/41242 [34:45<1:40:53,  5.05it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10693/41242 [34:45<1:42:22,  4.97it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10693/41242 [34:45<1:42:22,  4.97it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10694/41242 [34:45<1:41:10,  5.03it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10694/41242 [34:45<1:41:10,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10695/41242 [34:45<1:40:38,  5.06it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10695/41242 [34:46<1:40:38,  5.06it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10696/41242 [34:46<1:40:36,  5.06it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10696/41242 [34:46<1:40:36,  5.06it/s, training_loss=0.026]
Epoch 1:  26%|██▌       | 10697/41242 [34:46<1:41:17,  5.03it/s, training_loss=0.026]
Epoch 1:  26%|██▌       | 10697/41242 [34:46<1:41:17,  5.03it/s, training_loss=0.362]
Epoch 1:  26%|██▌       | 10698/41242 [34:46<1:42:04,  4.99it/s, training_loss=0.362]
Epoch 1:  26%|██▌       | 10698/41242 [34:46<1:42:04,  4.99it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10699/41242 [34:46<1:44:38,  4.86it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10699/41242 [34:46<1:44:38,  4.86it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10700/41242 [34:46<1:43:39,  4.91it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10700/41242 [34:47<1:43:39,  4.91it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10701/41242 [34:47<1:43:45,  4.91it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10701/41242 [34:47<1:43:45,  4.91it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10702/41242 [34:47<1:43:15,  4.93it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10702/41242 [34:47<1:43:15,  4.93it/s, training_loss=0.002]
Epoch 1:  26%|██▌       | 10703/41242 [34:47<1:43:05,  4.94it/s, training_loss=0.002]
Epoch 1:  26%|██▌       | 10703/41242 [34:47<1:43:05,  4.94it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10704/41242 [34:47<1:44:39,  4.86it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10704/41242 [34:47<1:44:39,  4.86it/s, training_loss=0.135]
Epoch 1:  26%|██▌       | 10705/41242 [34:47<1:43:59,  4.89it/s, training_loss=0.135]
Epoch 1:  26%|██▌       | 10705/41242 [34:48<1:43:59,  4.89it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10706/41242 [34:48<1:41:15,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10706/41242 [34:48<1:41:15,  5.03it/s, training_loss=0.124]
Epoch 1:  26%|██▌       | 10707/41242 [34:48<1:42:05,  4.98it/s, training_loss=0.124]
Epoch 1:  26%|██▌       | 10707/41242 [34:48<1:42:05,  4.98it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10708/41242 [34:48<1:40:39,  5.06it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10708/41242 [34:48<1:40:39,  5.06it/s, training_loss=0.343]
Epoch 1:  26%|██▌       | 10709/41242 [34:48<1:40:44,  5.05it/s, training_loss=0.343]
Epoch 1:  26%|██▌       | 10709/41242 [34:48<1:40:44,  5.05it/s, training_loss=0.014]
Epoch 1:  26%|██▌       | 10710/41242 [34:48<1:40:47,  5.05it/s, training_loss=0.014]
Epoch 1:  26%|██▌       | 10710/41242 [34:49<1:40:47,  5.05it/s, training_loss=0.455]
Epoch 1:  26%|██▌       | 10711/41242 [34:49<1:40:23,  5.07it/s, training_loss=0.455]
Epoch 1:  26%|██▌       | 10711/41242 [34:49<1:40:23,  5.07it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10712/41242 [34:49<1:42:42,  4.95it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10712/41242 [34:49<1:42:42,  4.95it/s, training_loss=0.028]
Epoch 1:  26%|██▌       | 10713/41242 [34:49<1:42:08,  4.98it/s, training_loss=0.028]
Epoch 1:  26%|██▌       | 10713/41242 [34:49<1:42:08,  4.98it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10714/41242 [34:49<1:42:02,  4.99it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10714/41242 [34:49<1:42:02,  4.99it/s, training_loss=0.036]
Epoch 1:  26%|██▌       | 10715/41242 [34:49<1:42:38,  4.96it/s, training_loss=0.036]
Epoch 1:  26%|██▌       | 10715/41242 [34:50<1:42:38,  4.96it/s, training_loss=0.076]
Epoch 1:  26%|██▌       | 10716/41242 [34:50<1:43:23,  4.92it/s, training_loss=0.076]
Epoch 1:  26%|██▌       | 10716/41242 [34:50<1:43:23,  4.92it/s, training_loss=0.013]
Epoch 1:  26%|██▌       | 10717/41242 [34:50<1:42:43,  4.95it/s, training_loss=0.013]
Epoch 1:  26%|██▌       | 10717/41242 [34:50<1:42:43,  4.95it/s, training_loss=0.099]
Epoch 1:  26%|██▌       | 10718/41242 [34:50<1:42:07,  4.98it/s, training_loss=0.099]
Epoch 1:  26%|██▌       | 10718/41242 [34:50<1:42:07,  4.98it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10719/41242 [34:50<1:41:51,  4.99it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10719/41242 [34:50<1:41:51,  4.99it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10720/41242 [34:50<1:43:13,  4.93it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10720/41242 [34:51<1:43:13,  4.93it/s, training_loss=0.382]
Epoch 1:  26%|██▌       | 10721/41242 [34:51<1:43:15,  4.93it/s, training_loss=0.382]
Epoch 1:  26%|██▌       | 10721/41242 [34:51<1:43:15,  4.93it/s, training_loss=0.073]
Epoch 1:  26%|██▌       | 10722/41242 [34:51<1:42:55,  4.94it/s, training_loss=0.073]
Epoch 1:  26%|██▌       | 10722/41242 [34:51<1:42:55,  4.94it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10723/41242 [34:51<1:43:17,  4.92it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10723/41242 [34:51<1:43:17,  4.92it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10724/41242 [34:51<1:43:00,  4.94it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10724/41242 [34:51<1:43:00,  4.94it/s, training_loss=0.018]
Epoch 1:  26%|██▌       | 10725/41242 [34:51<1:45:09,  4.84it/s, training_loss=0.018]
Epoch 1:  26%|██▌       | 10725/41242 [34:52<1:45:09,  4.84it/s, training_loss=0.165]
Epoch 1:  26%|██▌       | 10726/41242 [34:52<1:44:16,  4.88it/s, training_loss=0.165]
Epoch 1:  26%|██▌       | 10726/41242 [34:52<1:44:16,  4.88it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10727/41242 [34:52<1:44:14,  4.88it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10727/41242 [34:52<1:44:14,  4.88it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10728/41242 [34:52<1:43:41,  4.90it/s, training_loss=0.007]
Epoch 1:  26%|██▌       | 10728/41242 [34:52<1:43:41,  4.90it/s, training_loss=0.680]
Epoch 1:  26%|██▌       | 10729/41242 [34:52<1:42:18,  4.97it/s, training_loss=0.680]
Epoch 1:  26%|██▌       | 10729/41242 [34:52<1:42:18,  4.97it/s, training_loss=0.100]
Epoch 1:  26%|██▌       | 10730/41242 [34:52<1:40:48,  5.04it/s, training_loss=0.100]
Epoch 1:  26%|██▌       | 10730/41242 [34:53<1:40:48,  5.04it/s, training_loss=0.040]
Epoch 1:  26%|██▌       | 10731/41242 [34:53<1:40:01,  5.08it/s, training_loss=0.040]
Epoch 1:  26%|██▌       | 10731/41242 [34:53<1:40:01,  5.08it/s, training_loss=0.049]
Epoch 1:  26%|██▌       | 10732/41242 [34:53<1:40:11,  5.08it/s, training_loss=0.049]
Epoch 1:  26%|██▌       | 10732/41242 [34:53<1:40:11,  5.08it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10733/41242 [34:53<1:39:26,  5.11it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10733/41242 [34:53<1:39:26,  5.11it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10734/41242 [34:53<1:39:29,  5.11it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10734/41242 [34:53<1:39:29,  5.11it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10735/41242 [34:53<1:38:19,  5.17it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10735/41242 [34:54<1:38:19,  5.17it/s, training_loss=0.028]
Epoch 1:  26%|██▌       | 10736/41242 [34:54<1:40:05,  5.08it/s, training_loss=0.028]
Epoch 1:  26%|██▌       | 10736/41242 [34:54<1:40:05,  5.08it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10737/41242 [34:54<1:41:24,  5.01it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10737/41242 [34:54<1:41:24,  5.01it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10738/41242 [34:54<1:42:38,  4.95it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10738/41242 [34:54<1:42:38,  4.95it/s, training_loss=0.185]
Epoch 1:  26%|██▌       | 10739/41242 [34:54<1:42:48,  4.95it/s, training_loss=0.185]
Epoch 1:  26%|██▌       | 10739/41242 [34:54<1:42:48,  4.95it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10740/41242 [34:54<1:40:16,  5.07it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10740/41242 [34:55<1:40:16,  5.07it/s, training_loss=0.012]
Epoch 1:  26%|██▌       | 10741/41242 [34:55<1:38:46,  5.15it/s, training_loss=0.012]
Epoch 1:  26%|██▌       | 10741/41242 [34:55<1:38:46,  5.15it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10742/41242 [34:55<1:37:18,  5.22it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10742/41242 [34:55<1:37:18,  5.22it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10743/41242 [34:55<1:36:31,  5.27it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10743/41242 [34:55<1:36:31,  5.27it/s, training_loss=0.071]
Epoch 1:  26%|██▌       | 10744/41242 [34:55<1:36:49,  5.25it/s, training_loss=0.071]
Epoch 1:  26%|██▌       | 10744/41242 [34:55<1:36:49,  5.25it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10745/41242 [34:55<1:39:59,  5.08it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10745/41242 [34:56<1:39:59,  5.08it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10746/41242 [34:56<1:40:58,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10746/41242 [34:56<1:40:58,  5.03it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10747/41242 [34:56<1:40:35,  5.05it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10747/41242 [34:56<1:40:35,  5.05it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10748/41242 [34:56<1:40:04,  5.08it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10748/41242 [34:56<1:40:04,  5.08it/s, training_loss=0.574]
Epoch 1:  26%|██▌       | 10749/41242 [34:56<1:39:49,  5.09it/s, training_loss=0.574]
Epoch 1:  26%|██▌       | 10749/41242 [34:56<1:39:49,  5.09it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10750/41242 [34:56<1:39:41,  5.10it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10750/41242 [34:56<1:39:41,  5.10it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10751/41242 [34:56<1:38:38,  5.15it/s, training_loss=0.010]
Epoch 1:  26%|██▌       | 10751/41242 [34:57<1:38:38,  5.15it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10752/41242 [34:57<1:38:33,  5.16it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10752/41242 [34:57<1:38:33,  5.16it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10753/41242 [34:57<1:38:45,  5.15it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10753/41242 [34:57<1:38:45,  5.15it/s, training_loss=0.095]
Epoch 1:  26%|██▌       | 10754/41242 [34:57<1:38:31,  5.16it/s, training_loss=0.095]
Epoch 1:  26%|██▌       | 10754/41242 [34:57<1:38:31,  5.16it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10755/41242 [34:57<1:38:03,  5.18it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10755/41242 [34:57<1:38:03,  5.18it/s, training_loss=0.642]
Epoch 1:  26%|██▌       | 10756/41242 [34:57<1:37:52,  5.19it/s, training_loss=0.642]
Epoch 1:  26%|██▌       | 10756/41242 [34:58<1:37:52,  5.19it/s, training_loss=0.026]
Epoch 1:  26%|██▌       | 10757/41242 [34:58<1:38:36,  5.15it/s, training_loss=0.026]
Epoch 1:  26%|██▌       | 10757/41242 [34:58<1:38:36,  5.15it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10758/41242 [34:58<1:40:56,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10758/41242 [34:58<1:40:56,  5.03it/s, training_loss=0.291]
Epoch 1:  26%|██▌       | 10759/41242 [34:58<1:41:02,  5.03it/s, training_loss=0.291]
Epoch 1:  26%|██▌       | 10759/41242 [34:58<1:41:02,  5.03it/s, training_loss=0.961]
Epoch 1:  26%|██▌       | 10760/41242 [34:58<1:42:02,  4.98it/s, training_loss=0.961]
Epoch 1:  26%|██▌       | 10760/41242 [34:58<1:42:02,  4.98it/s, training_loss=0.206]
Epoch 1:  26%|██▌       | 10761/41242 [34:58<1:40:49,  5.04it/s, training_loss=0.206]
Epoch 1:  26%|██▌       | 10761/41242 [34:59<1:40:49,  5.04it/s, training_loss=0.356]
Epoch 1:  26%|██▌       | 10762/41242 [34:59<1:40:05,  5.08it/s, training_loss=0.356]
Epoch 1:  26%|██▌       | 10762/41242 [34:59<1:40:05,  5.08it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10763/41242 [34:59<1:39:04,  5.13it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10763/41242 [34:59<1:39:04,  5.13it/s, training_loss=0.014]
Epoch 1:  26%|██▌       | 10764/41242 [34:59<1:43:19,  4.92it/s, training_loss=0.014]
Epoch 1:  26%|██▌       | 10764/41242 [34:59<1:43:19,  4.92it/s, training_loss=0.247]
Epoch 1:  26%|██▌       | 10765/41242 [34:59<1:43:13,  4.92it/s, training_loss=0.247]
Epoch 1:  26%|██▌       | 10765/41242 [34:59<1:43:13,  4.92it/s, training_loss=0.002]
Epoch 1:  26%|██▌       | 10766/41242 [34:59<1:43:20,  4.92it/s, training_loss=0.002]
Epoch 1:  26%|██▌       | 10766/41242 [35:00<1:43:20,  4.92it/s, training_loss=0.032]
Epoch 1:  26%|██▌       | 10767/41242 [35:00<1:41:20,  5.01it/s, training_loss=0.032]
Epoch 1:  26%|██▌       | 10767/41242 [35:00<1:41:20,  5.01it/s, training_loss=0.056]
Epoch 1:  26%|██▌       | 10768/41242 [35:00<1:40:00,  5.08it/s, training_loss=0.056]
Epoch 1:  26%|██▌       | 10768/41242 [35:00<1:40:00,  5.08it/s, training_loss=0.389]
Epoch 1:  26%|██▌       | 10769/41242 [35:00<1:39:22,  5.11it/s, training_loss=0.389]
Epoch 1:  26%|██▌       | 10769/41242 [35:00<1:39:22,  5.11it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10770/41242 [35:00<1:37:52,  5.19it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10770/41242 [35:00<1:37:52,  5.19it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10771/41242 [35:00<1:37:16,  5.22it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10771/41242 [35:01<1:37:16,  5.22it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10772/41242 [35:01<1:38:34,  5.15it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10772/41242 [35:01<1:38:34,  5.15it/s, training_loss=0.415]
Epoch 1:  26%|██▌       | 10773/41242 [35:01<1:38:06,  5.18it/s, training_loss=0.415]
Epoch 1:  26%|██▌       | 10773/41242 [35:01<1:38:06,  5.18it/s, training_loss=0.050]
Epoch 1:  26%|██▌       | 10774/41242 [35:01<1:39:40,  5.09it/s, training_loss=0.050]
Epoch 1:  26%|██▌       | 10774/41242 [35:01<1:39:40,  5.09it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10775/41242 [35:01<1:38:55,  5.13it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10775/41242 [35:01<1:38:55,  5.13it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10776/41242 [35:01<1:38:20,  5.16it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10776/41242 [35:02<1:38:20,  5.16it/s, training_loss=0.070]
Epoch 1:  26%|██▌       | 10777/41242 [35:02<1:39:44,  5.09it/s, training_loss=0.070]
Epoch 1:  26%|██▌       | 10777/41242 [35:02<1:39:44,  5.09it/s, training_loss=0.091]
Epoch 1:  26%|██▌       | 10778/41242 [35:02<1:39:53,  5.08it/s, training_loss=0.091]
Epoch 1:  26%|██▌       | 10778/41242 [35:02<1:39:53,  5.08it/s, training_loss=0.031]
Epoch 1:  26%|██▌       | 10779/41242 [35:02<1:39:21,  5.11it/s, training_loss=0.031]
Epoch 1:  26%|██▌       | 10779/41242 [35:02<1:39:21,  5.11it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10780/41242 [35:02<1:39:59,  5.08it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10780/41242 [35:02<1:39:59,  5.08it/s, training_loss=0.036]
Epoch 1:  26%|██▌       | 10781/41242 [35:02<1:40:10,  5.07it/s, training_loss=0.036]
Epoch 1:  26%|██▌       | 10781/41242 [35:03<1:40:10,  5.07it/s, training_loss=0.207]
Epoch 1:  26%|██▌       | 10782/41242 [35:03<1:39:49,  5.09it/s, training_loss=0.207]
Epoch 1:  26%|██▌       | 10782/41242 [35:03<1:39:49,  5.09it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10783/41242 [35:03<1:40:52,  5.03it/s, training_loss=0.009]
Epoch 1:  26%|██▌       | 10783/41242 [35:03<1:40:52,  5.03it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10784/41242 [35:03<1:40:13,  5.06it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10784/41242 [35:03<1:40:13,  5.06it/s, training_loss=0.218]
Epoch 1:  26%|██▌       | 10785/41242 [35:03<1:39:18,  5.11it/s, training_loss=0.218]
Epoch 1:  26%|██▌       | 10785/41242 [35:03<1:39:18,  5.11it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10786/41242 [35:03<1:38:51,  5.14it/s, training_loss=0.041]
Epoch 1:  26%|██▌       | 10786/41242 [35:04<1:38:51,  5.14it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10787/41242 [35:04<1:37:27,  5.21it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10787/41242 [35:04<1:37:27,  5.21it/s, training_loss=0.225]
Epoch 1:  26%|██▌       | 10788/41242 [35:04<1:37:08,  5.22it/s, training_loss=0.225]
Epoch 1:  26%|██▌       | 10788/41242 [35:04<1:37:08,  5.22it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10789/41242 [35:04<1:36:35,  5.25it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10789/41242 [35:04<1:36:35,  5.25it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10790/41242 [35:04<1:36:51,  5.24it/s, training_loss=0.005]
Epoch 1:  26%|██▌       | 10790/41242 [35:04<1:36:51,  5.24it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10791/41242 [35:04<1:37:56,  5.18it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10791/41242 [35:05<1:37:56,  5.18it/s, training_loss=0.020]
Epoch 1:  26%|██▌       | 10792/41242 [35:05<1:38:32,  5.15it/s, training_loss=0.020]
Epoch 1:  26%|██▌       | 10792/41242 [35:05<1:38:32,  5.15it/s, training_loss=0.553]
Epoch 1:  26%|██▌       | 10793/41242 [35:05<1:39:54,  5.08it/s, training_loss=0.553]
Epoch 1:  26%|██▌       | 10793/41242 [35:05<1:39:54,  5.08it/s, training_loss=0.053]
Epoch 1:  26%|██▌       | 10794/41242 [35:05<1:39:06,  5.12it/s, training_loss=0.053]
Epoch 1:  26%|██▌       | 10794/41242 [35:05<1:39:06,  5.12it/s, training_loss=0.064]
Epoch 1:  26%|██▌       | 10795/41242 [35:05<1:38:33,  5.15it/s, training_loss=0.064]
Epoch 1:  26%|██▌       | 10795/41242 [35:05<1:38:33,  5.15it/s, training_loss=0.112]
Epoch 1:  26%|██▌       | 10796/41242 [35:05<1:37:52,  5.18it/s, training_loss=0.112]
Epoch 1:  26%|██▌       | 10796/41242 [35:05<1:37:52,  5.18it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10797/41242 [35:06<1:39:04,  5.12it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10797/41242 [35:06<1:39:04,  5.12it/s, training_loss=0.058]
Epoch 1:  26%|██▌       | 10798/41242 [35:06<1:39:57,  5.08it/s, training_loss=0.058]
Epoch 1:  26%|██▌       | 10798/41242 [35:06<1:39:57,  5.08it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10799/41242 [35:06<1:39:01,  5.12it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10799/41242 [35:06<1:39:01,  5.12it/s, training_loss=0.124]
Epoch 1:  26%|██▌       | 10800/41242 [35:06<1:37:58,  5.18it/s, training_loss=0.124]
Epoch 1:  26%|██▌       | 10800/41242 [35:06<1:37:58,  5.18it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10801/41242 [35:06<1:40:03,  5.07it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10801/41242 [35:06<1:40:03,  5.07it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10802/41242 [35:06<1:40:05,  5.07it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10802/41242 [35:07<1:40:05,  5.07it/s, training_loss=0.039]
Epoch 1:  26%|██▌       | 10803/41242 [35:07<1:40:10,  5.06it/s, training_loss=0.039]
Epoch 1:  26%|██▌       | 10803/41242 [35:07<1:40:10,  5.06it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10804/41242 [35:07<1:39:22,  5.10it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10804/41242 [35:07<1:39:22,  5.10it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10805/41242 [35:07<1:38:46,  5.14it/s, training_loss=0.008]
Epoch 1:  26%|██▌       | 10805/41242 [35:07<1:38:46,  5.14it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10806/41242 [35:07<1:38:10,  5.17it/s, training_loss=0.019]
Epoch 1:  26%|██▌       | 10806/41242 [35:07<1:38:10,  5.17it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10807/41242 [35:07<1:36:56,  5.23it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10807/41242 [35:08<1:36:56,  5.23it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10808/41242 [35:08<1:36:51,  5.24it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10808/41242 [35:08<1:36:51,  5.24it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10809/41242 [35:08<1:36:00,  5.28it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10809/41242 [35:08<1:36:00,  5.28it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10810/41242 [35:08<1:38:11,  5.17it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10810/41242 [35:08<1:38:11,  5.17it/s, training_loss=0.031]
Epoch 1:  26%|██▌       | 10811/41242 [35:08<1:37:23,  5.21it/s, training_loss=0.031]
Epoch 1:  26%|██▌       | 10811/41242 [35:08<1:37:23,  5.21it/s, training_loss=0.023]
Epoch 1:  26%|██▌       | 10812/41242 [35:08<1:37:33,  5.20it/s, training_loss=0.023]
Epoch 1:  26%|██▌       | 10812/41242 [35:09<1:37:33,  5.20it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10813/41242 [35:09<1:36:28,  5.26it/s, training_loss=0.006]
Epoch 1:  26%|██▌       | 10813/41242 [35:09<1:36:28,  5.26it/s, training_loss=0.070]
Epoch 1:  26%|██▌       | 10814/41242 [35:09<1:36:42,  5.24it/s, training_loss=0.070]
Epoch 1:  26%|██▌       | 10814/41242 [35:09<1:36:42,  5.24it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10815/41242 [35:09<1:36:09,  5.27it/s, training_loss=0.003]
Epoch 1:  26%|██▌       | 10815/41242 [35:09<1:36:09,  5.27it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10816/41242 [35:09<1:36:12,  5.27it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10816/41242 [35:09<1:36:12,  5.27it/s, training_loss=0.489]
Epoch 1:  26%|██▌       | 10817/41242 [35:09<1:36:36,  5.25it/s, training_loss=0.489]
Epoch 1:  26%|██▌       | 10817/41242 [35:10<1:36:36,  5.25it/s, training_loss=0.444]
Epoch 1:  26%|██▌       | 10818/41242 [35:10<1:37:59,  5.17it/s, training_loss=0.444]
Epoch 1:  26%|██▌       | 10818/41242 [35:10<1:37:59,  5.17it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10819/41242 [35:10<1:37:34,  5.20it/s, training_loss=0.017]
Epoch 1:  26%|██▌       | 10819/41242 [35:10<1:37:34,  5.20it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10820/41242 [35:10<1:37:09,  5.22it/s, training_loss=0.011]
Epoch 1:  26%|██▌       | 10820/41242 [35:10<1:37:09,  5.22it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10821/41242 [35:10<1:37:03,  5.22it/s, training_loss=0.015]
Epoch 1:  26%|██▌       | 10821/41242 [35:10<1:37:03,  5.22it/s, training_loss=0.629]
Epoch 1:  26%|██▌       | 10822/41242 [35:10<1:36:55,  5.23it/s, training_loss=0.629]
Epoch 1:  26%|██▌       | 10822/41242 [35:11<1:36:55,  5.23it/s, training_loss=0.430]
Epoch 1:  26%|██▌       | 10823/41242 [35:11<1:40:46,  5.03it/s, training_loss=0.430]
Epoch 1:  26%|██▌       | 10823/41242 [35:11<1:40:46,  5.03it/s, training_loss=0.040]
Epoch 1:  26%|██▌       | 10824/41242 [35:11<1:41:51,  4.98it/s, training_loss=0.040]
Epoch 1:  26%|██▌       | 10824/41242 [35:11<1:41:51,  4.98it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10825/41242 [35:11<1:40:45,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▌       | 10825/41242 [35:11<1:40:45,  5.03it/s, training_loss=0.016]
Epoch 1:  26%|██▌       | 10826/41242 [35:11<1:39:09,  5.11it/s, training_loss=0.016]
Epoch 1:  26%|██▌       | 10826/41242 [35:11<1:39:09,  5.11it/s, training_loss=0.035]
Epoch 1:  26%|██▋       | 10827/41242 [35:11<1:38:31,  5.15it/s, training_loss=0.035]
Epoch 1:  26%|██▋       | 10827/41242 [35:12<1:38:31,  5.15it/s, training_loss=0.176]
Epoch 1:  26%|██▋       | 10828/41242 [35:12<1:38:42,  5.14it/s, training_loss=0.176]
Epoch 1:  26%|██▋       | 10828/41242 [35:12<1:38:42,  5.14it/s, training_loss=0.117]
Epoch 1:  26%|██▋       | 10829/41242 [35:12<1:38:19,  5.15it/s, training_loss=0.117]
Epoch 1:  26%|██▋       | 10829/41242 [35:12<1:38:19,  5.15it/s, training_loss=0.082]
Epoch 1:  26%|██▋       | 10830/41242 [35:12<1:37:33,  5.20it/s, training_loss=0.082]
Epoch 1:  26%|██▋       | 10830/41242 [35:12<1:37:33,  5.20it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10831/41242 [35:12<1:37:05,  5.22it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10831/41242 [35:12<1:37:05,  5.22it/s, training_loss=0.023]
Epoch 1:  26%|██▋       | 10832/41242 [35:12<1:39:22,  5.10it/s, training_loss=0.023]
Epoch 1:  26%|██▋       | 10832/41242 [35:12<1:39:22,  5.10it/s, training_loss=0.011]
Epoch 1:  26%|██▋       | 10833/41242 [35:12<1:41:21,  5.00it/s, training_loss=0.011]
Epoch 1:  26%|██▋       | 10833/41242 [35:13<1:41:21,  5.00it/s, training_loss=0.013]
Epoch 1:  26%|██▋       | 10834/41242 [35:13<1:38:52,  5.13it/s, training_loss=0.013]
Epoch 1:  26%|██▋       | 10834/41242 [35:13<1:38:52,  5.13it/s, training_loss=0.016]
Epoch 1:  26%|██▋       | 10835/41242 [35:13<1:38:19,  5.15it/s, training_loss=0.016]
Epoch 1:  26%|██▋       | 10835/41242 [35:13<1:38:19,  5.15it/s, training_loss=0.499]
Epoch 1:  26%|██▋       | 10836/41242 [35:13<1:38:19,  5.15it/s, training_loss=0.499]
Epoch 1:  26%|██▋       | 10836/41242 [35:13<1:38:19,  5.15it/s, training_loss=0.157]
Epoch 1:  26%|██▋       | 10837/41242 [35:13<1:39:55,  5.07it/s, training_loss=0.157]
Epoch 1:  26%|██▋       | 10837/41242 [35:13<1:39:55,  5.07it/s, training_loss=0.002]
Epoch 1:  26%|██▋       | 10838/41242 [35:13<1:40:02,  5.07it/s, training_loss=0.002]
Epoch 1:  26%|██▋       | 10838/41242 [35:14<1:40:02,  5.07it/s, training_loss=0.021]
Epoch 1:  26%|██▋       | 10839/41242 [35:14<1:40:57,  5.02it/s, training_loss=0.021]
Epoch 1:  26%|██▋       | 10839/41242 [35:14<1:40:57,  5.02it/s, training_loss=0.650]
Epoch 1:  26%|██▋       | 10840/41242 [35:14<1:41:56,  4.97it/s, training_loss=0.650]
Epoch 1:  26%|██▋       | 10840/41242 [35:14<1:41:56,  4.97it/s, training_loss=0.391]
Epoch 1:  26%|██▋       | 10841/41242 [35:14<1:42:43,  4.93it/s, training_loss=0.391]
Epoch 1:  26%|██▋       | 10841/41242 [35:14<1:42:43,  4.93it/s, training_loss=0.415]
Epoch 1:  26%|██▋       | 10842/41242 [35:14<1:44:04,  4.87it/s, training_loss=0.415]
Epoch 1:  26%|██▋       | 10842/41242 [35:14<1:44:04,  4.87it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10843/41242 [35:14<1:42:44,  4.93it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10843/41242 [35:15<1:42:44,  4.93it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10844/41242 [35:15<1:42:34,  4.94it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10844/41242 [35:15<1:42:34,  4.94it/s, training_loss=0.291]
Epoch 1:  26%|██▋       | 10845/41242 [35:15<1:41:20,  5.00it/s, training_loss=0.291]
Epoch 1:  26%|██▋       | 10845/41242 [35:15<1:41:20,  5.00it/s, training_loss=0.178]
Epoch 1:  26%|██▋       | 10846/41242 [35:15<1:39:17,  5.10it/s, training_loss=0.178]
Epoch 1:  26%|██▋       | 10846/41242 [35:15<1:39:17,  5.10it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10847/41242 [35:15<1:38:14,  5.16it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10847/41242 [35:15<1:38:14,  5.16it/s, training_loss=0.039]
Epoch 1:  26%|██▋       | 10848/41242 [35:15<1:37:48,  5.18it/s, training_loss=0.039]
Epoch 1:  26%|██▋       | 10848/41242 [35:16<1:37:48,  5.18it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10849/41242 [35:16<1:37:55,  5.17it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10849/41242 [35:16<1:37:55,  5.17it/s, training_loss=0.427]
Epoch 1:  26%|██▋       | 10850/41242 [35:16<1:39:52,  5.07it/s, training_loss=0.427]
Epoch 1:  26%|██▋       | 10850/41242 [35:16<1:39:52,  5.07it/s, training_loss=0.177]
Epoch 1:  26%|██▋       | 10851/41242 [35:16<1:38:45,  5.13it/s, training_loss=0.177]
Epoch 1:  26%|██▋       | 10851/41242 [35:16<1:38:45,  5.13it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10852/41242 [35:16<1:37:36,  5.19it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10852/41242 [35:16<1:37:36,  5.19it/s, training_loss=0.075]
Epoch 1:  26%|██▋       | 10853/41242 [35:16<1:37:43,  5.18it/s, training_loss=0.075]
Epoch 1:  26%|██▋       | 10853/41242 [35:17<1:37:43,  5.18it/s, training_loss=0.606]
Epoch 1:  26%|██▋       | 10854/41242 [35:17<1:37:48,  5.18it/s, training_loss=0.606]
Epoch 1:  26%|██▋       | 10854/41242 [35:17<1:37:48,  5.18it/s, training_loss=0.009]
Epoch 1:  26%|██▋       | 10855/41242 [35:17<1:38:12,  5.16it/s, training_loss=0.009]
Epoch 1:  26%|██▋       | 10855/41242 [35:17<1:38:12,  5.16it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10856/41242 [35:17<1:37:24,  5.20it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10856/41242 [35:17<1:37:24,  5.20it/s, training_loss=0.249]
Epoch 1:  26%|██▋       | 10857/41242 [35:17<1:37:05,  5.22it/s, training_loss=0.249]
Epoch 1:  26%|██▋       | 10857/41242 [35:17<1:37:05,  5.22it/s, training_loss=0.373]
Epoch 1:  26%|██▋       | 10858/41242 [35:17<1:36:39,  5.24it/s, training_loss=0.373]
Epoch 1:  26%|██▋       | 10858/41242 [35:18<1:36:39,  5.24it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10859/41242 [35:18<1:35:27,  5.30it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10859/41242 [35:18<1:35:27,  5.30it/s, training_loss=0.224]
Epoch 1:  26%|██▋       | 10860/41242 [35:18<1:36:22,  5.25it/s, training_loss=0.224]
Epoch 1:  26%|██▋       | 10860/41242 [35:18<1:36:22,  5.25it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10861/41242 [35:18<1:37:48,  5.18it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10861/41242 [35:18<1:37:48,  5.18it/s, training_loss=0.002]
Epoch 1:  26%|██▋       | 10862/41242 [35:18<1:38:14,  5.15it/s, training_loss=0.002]
Epoch 1:  26%|██▋       | 10862/41242 [35:18<1:38:14,  5.15it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10863/41242 [35:18<1:36:51,  5.23it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10863/41242 [35:19<1:36:51,  5.23it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10864/41242 [35:19<1:36:43,  5.23it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10864/41242 [35:19<1:36:43,  5.23it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10865/41242 [35:19<1:36:24,  5.25it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10865/41242 [35:19<1:36:24,  5.25it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10866/41242 [35:19<1:35:43,  5.29it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10866/41242 [35:19<1:35:43,  5.29it/s, training_loss=0.015]
Epoch 1:  26%|██▋       | 10867/41242 [35:19<1:37:15,  5.21it/s, training_loss=0.015]
Epoch 1:  26%|██▋       | 10867/41242 [35:19<1:37:15,  5.21it/s, training_loss=0.453]
Epoch 1:  26%|██▋       | 10868/41242 [35:19<1:36:41,  5.24it/s, training_loss=0.453]
Epoch 1:  26%|██▋       | 10868/41242 [35:19<1:36:41,  5.24it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10869/41242 [35:19<1:35:57,  5.28it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10869/41242 [35:20<1:35:57,  5.28it/s, training_loss=0.027]
Epoch 1:  26%|██▋       | 10870/41242 [35:20<1:38:13,  5.15it/s, training_loss=0.027]
Epoch 1:  26%|██▋       | 10870/41242 [35:20<1:38:13,  5.15it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10871/41242 [35:20<1:40:07,  5.06it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10871/41242 [35:20<1:40:07,  5.06it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10872/41242 [35:20<1:38:52,  5.12it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10872/41242 [35:20<1:38:52,  5.12it/s, training_loss=0.057]
Epoch 1:  26%|██▋       | 10873/41242 [35:20<1:38:17,  5.15it/s, training_loss=0.057]
Epoch 1:  26%|██▋       | 10873/41242 [35:20<1:38:17,  5.15it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10874/41242 [35:20<1:37:50,  5.17it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10874/41242 [35:21<1:37:50,  5.17it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10875/41242 [35:21<1:36:11,  5.26it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10875/41242 [35:21<1:36:11,  5.26it/s, training_loss=0.044]
Epoch 1:  26%|██▋       | 10876/41242 [35:21<1:40:02,  5.06it/s, training_loss=0.044]
Epoch 1:  26%|██▋       | 10876/41242 [35:21<1:40:02,  5.06it/s, training_loss=0.370]
Epoch 1:  26%|██▋       | 10877/41242 [35:21<1:40:21,  5.04it/s, training_loss=0.370]
Epoch 1:  26%|██▋       | 10877/41242 [35:21<1:40:21,  5.04it/s, training_loss=0.288]
Epoch 1:  26%|██▋       | 10878/41242 [35:21<1:40:27,  5.04it/s, training_loss=0.288]
Epoch 1:  26%|██▋       | 10878/41242 [35:21<1:40:27,  5.04it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10879/41242 [35:21<1:40:50,  5.02it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10879/41242 [35:22<1:40:50,  5.02it/s, training_loss=0.941]
Epoch 1:  26%|██▋       | 10880/41242 [35:22<1:41:43,  4.97it/s, training_loss=0.941]
Epoch 1:  26%|██▋       | 10880/41242 [35:22<1:41:43,  4.97it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10881/41242 [35:22<1:40:38,  5.03it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10881/41242 [35:22<1:40:38,  5.03it/s, training_loss=0.036]
Epoch 1:  26%|██▋       | 10882/41242 [35:22<1:39:28,  5.09it/s, training_loss=0.036]
Epoch 1:  26%|██▋       | 10882/41242 [35:22<1:39:28,  5.09it/s, training_loss=0.057]
Epoch 1:  26%|██▋       | 10883/41242 [35:22<1:38:14,  5.15it/s, training_loss=0.057]
Epoch 1:  26%|██▋       | 10883/41242 [35:22<1:38:14,  5.15it/s, training_loss=0.263]
Epoch 1:  26%|██▋       | 10884/41242 [35:22<1:38:12,  5.15it/s, training_loss=0.263]
Epoch 1:  26%|██▋       | 10884/41242 [35:23<1:38:12,  5.15it/s, training_loss=0.013]
Epoch 1:  26%|██▋       | 10885/41242 [35:23<1:37:47,  5.17it/s, training_loss=0.013]
Epoch 1:  26%|██▋       | 10885/41242 [35:23<1:37:47,  5.17it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10886/41242 [35:23<1:36:48,  5.23it/s, training_loss=0.005]
Epoch 1:  26%|██▋       | 10886/41242 [35:23<1:36:48,  5.23it/s, training_loss=0.050]
Epoch 1:  26%|██▋       | 10887/41242 [35:23<1:36:19,  5.25it/s, training_loss=0.050]
Epoch 1:  26%|██▋       | 10887/41242 [35:23<1:36:19,  5.25it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10888/41242 [35:23<1:35:22,  5.30it/s, training_loss=0.007]
Epoch 1:  26%|██▋       | 10888/41242 [35:23<1:35:22,  5.30it/s, training_loss=0.018]
Epoch 1:  26%|██▋       | 10889/41242 [35:23<1:37:38,  5.18it/s, training_loss=0.018]
Epoch 1:  26%|██▋       | 10889/41242 [35:24<1:37:38,  5.18it/s, training_loss=0.245]
Epoch 1:  26%|██▋       | 10890/41242 [35:24<1:37:23,  5.19it/s, training_loss=0.245]
Epoch 1:  26%|██▋       | 10890/41242 [35:24<1:37:23,  5.19it/s, training_loss=0.213]
Epoch 1:  26%|██▋       | 10891/41242 [35:24<1:37:53,  5.17it/s, training_loss=0.213]
Epoch 1:  26%|██▋       | 10891/41242 [35:24<1:37:53,  5.17it/s, training_loss=0.124]
Epoch 1:  26%|██▋       | 10892/41242 [35:24<1:37:30,  5.19it/s, training_loss=0.124]
Epoch 1:  26%|██▋       | 10892/41242 [35:24<1:37:30,  5.19it/s, training_loss=0.072]
Epoch 1:  26%|██▋       | 10893/41242 [35:24<1:37:22,  5.19it/s, training_loss=0.072]
Epoch 1:  26%|██▋       | 10893/41242 [35:24<1:37:22,  5.19it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10894/41242 [35:24<1:40:27,  5.04it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10894/41242 [35:25<1:40:27,  5.04it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10895/41242 [35:25<1:39:43,  5.07it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10895/41242 [35:25<1:39:43,  5.07it/s, training_loss=0.118]
Epoch 1:  26%|██▋       | 10896/41242 [35:25<1:39:15,  5.10it/s, training_loss=0.118]
Epoch 1:  26%|██▋       | 10896/41242 [35:25<1:39:15,  5.10it/s, training_loss=0.017]
Epoch 1:  26%|██▋       | 10897/41242 [35:25<1:38:16,  5.15it/s, training_loss=0.017]
Epoch 1:  26%|██▋       | 10897/41242 [35:25<1:38:16,  5.15it/s, training_loss=0.065]
Epoch 1:  26%|██▋       | 10898/41242 [35:25<1:37:31,  5.19it/s, training_loss=0.065]
Epoch 1:  26%|██▋       | 10898/41242 [35:25<1:37:31,  5.19it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10899/41242 [35:25<1:36:58,  5.21it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10899/41242 [35:26<1:36:58,  5.21it/s, training_loss=0.045]
Epoch 1:  26%|██▋       | 10900/41242 [35:26<1:36:35,  5.24it/s, training_loss=0.045]
Epoch 1:  26%|██▋       | 10900/41242 [35:26<1:36:35,  5.24it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10901/41242 [35:26<1:37:50,  5.17it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10901/41242 [35:26<1:37:50,  5.17it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10902/41242 [35:26<1:39:20,  5.09it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10902/41242 [35:26<1:39:20,  5.09it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10903/41242 [35:26<1:38:57,  5.11it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10903/41242 [35:26<1:38:57,  5.11it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10904/41242 [35:26<1:37:49,  5.17it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10904/41242 [35:26<1:37:49,  5.17it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10905/41242 [35:26<1:36:58,  5.21it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10905/41242 [35:27<1:36:58,  5.21it/s, training_loss=0.037]
Epoch 1:  26%|██▋       | 10906/41242 [35:27<1:39:24,  5.09it/s, training_loss=0.037]
Epoch 1:  26%|██▋       | 10906/41242 [35:27<1:39:24,  5.09it/s, training_loss=0.008]
Epoch 1:  26%|██▋       | 10907/41242 [35:27<1:41:01,  5.00it/s, training_loss=0.008]
Epoch 1:  26%|██▋       | 10907/41242 [35:27<1:41:01,  5.00it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10908/41242 [35:27<1:40:54,  5.01it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10908/41242 [35:27<1:40:54,  5.01it/s, training_loss=0.133]
Epoch 1:  26%|██▋       | 10909/41242 [35:27<1:40:50,  5.01it/s, training_loss=0.133]
Epoch 1:  26%|██▋       | 10909/41242 [35:28<1:40:50,  5.01it/s, training_loss=0.047]
Epoch 1:  26%|██▋       | 10910/41242 [35:28<1:42:36,  4.93it/s, training_loss=0.047]
Epoch 1:  26%|██▋       | 10910/41242 [35:28<1:42:36,  4.93it/s, training_loss=0.345]
Epoch 1:  26%|██▋       | 10911/41242 [35:28<1:41:24,  4.99it/s, training_loss=0.345]
Epoch 1:  26%|██▋       | 10911/41242 [35:28<1:41:24,  4.99it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10912/41242 [35:28<1:41:38,  4.97it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10912/41242 [35:28<1:41:38,  4.97it/s, training_loss=0.477]
Epoch 1:  26%|██▋       | 10913/41242 [35:28<1:42:00,  4.95it/s, training_loss=0.477]
Epoch 1:  26%|██▋       | 10913/41242 [35:28<1:42:00,  4.95it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10914/41242 [35:28<1:40:44,  5.02it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10914/41242 [35:28<1:40:44,  5.02it/s, training_loss=0.029]
Epoch 1:  26%|██▋       | 10915/41242 [35:29<1:41:20,  4.99it/s, training_loss=0.029]
Epoch 1:  26%|██▋       | 10915/41242 [35:29<1:41:20,  4.99it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10916/41242 [35:29<1:40:29,  5.03it/s, training_loss=0.014]
Epoch 1:  26%|██▋       | 10916/41242 [35:29<1:40:29,  5.03it/s, training_loss=0.029]
Epoch 1:  26%|██▋       | 10917/41242 [35:29<1:41:35,  4.97it/s, training_loss=0.029]
Epoch 1:  26%|██▋       | 10917/41242 [35:29<1:41:35,  4.97it/s, training_loss=0.009]
Epoch 1:  26%|██▋       | 10918/41242 [35:29<1:40:13,  5.04it/s, training_loss=0.009]
Epoch 1:  26%|██▋       | 10918/41242 [35:29<1:40:13,  5.04it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10919/41242 [35:29<1:38:47,  5.12it/s, training_loss=0.003]
Epoch 1:  26%|██▋       | 10919/41242 [35:29<1:38:47,  5.12it/s, training_loss=0.036]
Epoch 1:  26%|██▋       | 10920/41242 [35:30<1:41:19,  4.99it/s, training_loss=0.036]
Epoch 1:  26%|██▋       | 10920/41242 [35:30<1:41:19,  4.99it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10921/41242 [35:30<1:41:03,  5.00it/s, training_loss=0.010]
Epoch 1:  26%|██▋       | 10921/41242 [35:30<1:41:03,  5.00it/s, training_loss=0.118]
Epoch 1:  26%|██▋       | 10922/41242 [35:30<1:40:26,  5.03it/s, training_loss=0.118]
Epoch 1:  26%|██▋       | 10922/41242 [35:30<1:40:26,  5.03it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10923/41242 [35:30<1:38:53,  5.11it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10923/41242 [35:30<1:38:53,  5.11it/s, training_loss=0.008]
Epoch 1:  26%|██▋       | 10924/41242 [35:30<1:38:45,  5.12it/s, training_loss=0.008]
Epoch 1:  26%|██▋       | 10924/41242 [35:30<1:38:45,  5.12it/s, training_loss=0.652]
Epoch 1:  26%|██▋       | 10925/41242 [35:30<1:38:01,  5.15it/s, training_loss=0.652]
Epoch 1:  26%|██▋       | 10925/41242 [35:31<1:38:01,  5.15it/s, training_loss=0.407]
Epoch 1:  26%|██▋       | 10926/41242 [35:31<1:38:16,  5.14it/s, training_loss=0.407]
Epoch 1:  26%|██▋       | 10926/41242 [35:31<1:38:16,  5.14it/s, training_loss=0.241]
Epoch 1:  26%|██▋       | 10927/41242 [35:31<1:38:01,  5.15it/s, training_loss=0.241]
Epoch 1:  26%|██▋       | 10927/41242 [35:31<1:38:01,  5.15it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10928/41242 [35:31<1:37:03,  5.21it/s, training_loss=0.004]
Epoch 1:  26%|██▋       | 10928/41242 [35:31<1:37:03,  5.21it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10929/41242 [35:31<1:35:52,  5.27it/s, training_loss=0.006]
Epoch 1:  26%|██▋       | 10929/41242 [35:31<1:35:52,  5.27it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10930/41242 [35:31<1:35:40,  5.28it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10930/41242 [35:32<1:35:40,  5.28it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 10931/41242 [35:32<1:35:55,  5.27it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 10931/41242 [35:32<1:35:55,  5.27it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 10932/41242 [35:32<1:35:54,  5.27it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 10932/41242 [35:32<1:35:54,  5.27it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10933/41242 [35:32<1:35:16,  5.30it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10933/41242 [35:32<1:35:16,  5.30it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10934/41242 [35:32<1:37:16,  5.19it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10934/41242 [35:32<1:37:16,  5.19it/s, training_loss=0.425]
Epoch 1:  27%|██▋       | 10935/41242 [35:32<1:36:44,  5.22it/s, training_loss=0.425]
Epoch 1:  27%|██▋       | 10935/41242 [35:33<1:36:44,  5.22it/s, training_loss=0.712]
Epoch 1:  27%|██▋       | 10936/41242 [35:33<1:36:56,  5.21it/s, training_loss=0.712]
Epoch 1:  27%|██▋       | 10936/41242 [35:33<1:36:56,  5.21it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 10937/41242 [35:33<1:36:27,  5.24it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 10937/41242 [35:33<1:36:27,  5.24it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10938/41242 [35:33<1:35:46,  5.27it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10938/41242 [35:33<1:35:46,  5.27it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 10939/41242 [35:33<1:35:50,  5.27it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 10939/41242 [35:33<1:35:50,  5.27it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10940/41242 [35:33<1:35:13,  5.30it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10940/41242 [35:34<1:35:13,  5.30it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10941/41242 [35:34<1:37:01,  5.20it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10941/41242 [35:34<1:37:01,  5.20it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10942/41242 [35:34<1:36:44,  5.22it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10942/41242 [35:34<1:36:44,  5.22it/s, training_loss=0.370]
Epoch 1:  27%|██▋       | 10943/41242 [35:34<1:37:10,  5.20it/s, training_loss=0.370]
Epoch 1:  27%|██▋       | 10943/41242 [35:34<1:37:10,  5.20it/s, training_loss=0.173]
Epoch 1:  27%|██▋       | 10944/41242 [35:34<1:37:19,  5.19it/s, training_loss=0.173]
Epoch 1:  27%|██▋       | 10944/41242 [35:34<1:37:19,  5.19it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 10945/41242 [35:34<1:37:36,  5.17it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 10945/41242 [35:34<1:37:36,  5.17it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10946/41242 [35:34<1:36:56,  5.21it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10946/41242 [35:35<1:36:56,  5.21it/s, training_loss=0.177]
Epoch 1:  27%|██▋       | 10947/41242 [35:35<1:36:49,  5.22it/s, training_loss=0.177]
Epoch 1:  27%|██▋       | 10947/41242 [35:35<1:36:49,  5.22it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 10948/41242 [35:35<1:37:10,  5.20it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 10948/41242 [35:35<1:37:10,  5.20it/s, training_loss=0.099]
Epoch 1:  27%|██▋       | 10949/41242 [35:35<1:36:59,  5.21it/s, training_loss=0.099]
Epoch 1:  27%|██▋       | 10949/41242 [35:35<1:36:59,  5.21it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10950/41242 [35:35<1:36:48,  5.22it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10950/41242 [35:35<1:36:48,  5.22it/s, training_loss=0.263]
Epoch 1:  27%|██▋       | 10951/41242 [35:35<1:37:04,  5.20it/s, training_loss=0.263]
Epoch 1:  27%|██▋       | 10951/41242 [35:36<1:37:04,  5.20it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10952/41242 [35:36<1:37:25,  5.18it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10952/41242 [35:36<1:37:25,  5.18it/s, training_loss=0.375]
Epoch 1:  27%|██▋       | 10953/41242 [35:36<1:36:56,  5.21it/s, training_loss=0.375]
Epoch 1:  27%|██▋       | 10953/41242 [35:36<1:36:56,  5.21it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10954/41242 [35:36<1:36:03,  5.26it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10954/41242 [35:36<1:36:03,  5.26it/s, training_loss=0.063]
Epoch 1:  27%|██▋       | 10955/41242 [35:36<1:36:06,  5.25it/s, training_loss=0.063]
Epoch 1:  27%|██▋       | 10955/41242 [35:36<1:36:06,  5.25it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10956/41242 [35:36<1:36:10,  5.25it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10956/41242 [35:37<1:36:10,  5.25it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10957/41242 [35:37<1:37:45,  5.16it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10957/41242 [35:37<1:37:45,  5.16it/s, training_loss=0.128]
Epoch 1:  27%|██▋       | 10958/41242 [35:37<1:38:10,  5.14it/s, training_loss=0.128]
Epoch 1:  27%|██▋       | 10958/41242 [35:37<1:38:10,  5.14it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10959/41242 [35:37<1:37:57,  5.15it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10959/41242 [35:37<1:37:57,  5.15it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10960/41242 [35:37<1:37:13,  5.19it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10960/41242 [35:37<1:37:13,  5.19it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10961/41242 [35:37<1:36:44,  5.22it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 10961/41242 [35:38<1:36:44,  5.22it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10962/41242 [35:38<1:36:29,  5.23it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 10962/41242 [35:38<1:36:29,  5.23it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10963/41242 [35:38<1:35:46,  5.27it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10963/41242 [35:38<1:35:46,  5.27it/s, training_loss=0.172]
Epoch 1:  27%|██▋       | 10964/41242 [35:38<1:35:56,  5.26it/s, training_loss=0.172]
Epoch 1:  27%|██▋       | 10964/41242 [35:38<1:35:56,  5.26it/s, training_loss=0.540]
Epoch 1:  27%|██▋       | 10965/41242 [35:38<1:38:14,  5.14it/s, training_loss=0.540]
Epoch 1:  27%|██▋       | 10965/41242 [35:38<1:38:14,  5.14it/s, training_loss=0.408]
Epoch 1:  27%|██▋       | 10966/41242 [35:38<1:37:51,  5.16it/s, training_loss=0.408]
Epoch 1:  27%|██▋       | 10966/41242 [35:39<1:37:51,  5.16it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10967/41242 [35:39<1:36:29,  5.23it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10967/41242 [35:39<1:36:29,  5.23it/s, training_loss=0.196]
Epoch 1:  27%|██▋       | 10968/41242 [35:39<1:36:41,  5.22it/s, training_loss=0.196]
Epoch 1:  27%|██▋       | 10968/41242 [35:39<1:36:41,  5.22it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10969/41242 [35:39<1:36:15,  5.24it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10969/41242 [35:39<1:36:15,  5.24it/s, training_loss=0.075]
Epoch 1:  27%|██▋       | 10970/41242 [35:39<1:36:30,  5.23it/s, training_loss=0.075]
Epoch 1:  27%|██▋       | 10970/41242 [35:39<1:36:30,  5.23it/s, training_loss=0.312]
Epoch 1:  27%|██▋       | 10971/41242 [35:39<1:37:20,  5.18it/s, training_loss=0.312]
Epoch 1:  27%|██▋       | 10971/41242 [35:39<1:37:20,  5.18it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 10972/41242 [35:39<1:39:04,  5.09it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 10972/41242 [35:40<1:39:04,  5.09it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10973/41242 [35:40<1:37:45,  5.16it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10973/41242 [35:40<1:37:45,  5.16it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10974/41242 [35:40<1:37:14,  5.19it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10974/41242 [35:40<1:37:14,  5.19it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10975/41242 [35:40<1:36:27,  5.23it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10975/41242 [35:40<1:36:27,  5.23it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10976/41242 [35:40<1:35:23,  5.29it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10976/41242 [35:40<1:35:23,  5.29it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10977/41242 [35:40<1:34:49,  5.32it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10977/41242 [35:41<1:34:49,  5.32it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 10978/41242 [35:41<1:34:55,  5.31it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 10978/41242 [35:41<1:34:55,  5.31it/s, training_loss=0.240]
Epoch 1:  27%|██▋       | 10979/41242 [35:41<1:36:17,  5.24it/s, training_loss=0.240]
Epoch 1:  27%|██▋       | 10979/41242 [35:41<1:36:17,  5.24it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 10980/41242 [35:41<1:36:09,  5.24it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 10980/41242 [35:41<1:36:09,  5.24it/s, training_loss=0.350]
Epoch 1:  27%|██▋       | 10981/41242 [35:41<1:36:13,  5.24it/s, training_loss=0.350]
Epoch 1:  27%|██▋       | 10981/41242 [35:41<1:36:13,  5.24it/s, training_loss=0.021]
Epoch 1:  27%|██▋       | 10982/41242 [35:41<1:35:47,  5.26it/s, training_loss=0.021]
Epoch 1:  27%|██▋       | 10982/41242 [35:42<1:35:47,  5.26it/s, training_loss=0.132]
Epoch 1:  27%|██▋       | 10983/41242 [35:42<1:35:54,  5.26it/s, training_loss=0.132]
Epoch 1:  27%|██▋       | 10983/41242 [35:42<1:35:54,  5.26it/s, training_loss=0.523]
Epoch 1:  27%|██▋       | 10984/41242 [35:42<1:36:16,  5.24it/s, training_loss=0.523]
Epoch 1:  27%|██▋       | 10984/41242 [35:42<1:36:16,  5.24it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 10985/41242 [35:42<1:36:00,  5.25it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 10985/41242 [35:42<1:36:00,  5.25it/s, training_loss=0.077]
Epoch 1:  27%|██▋       | 10986/41242 [35:42<1:35:58,  5.25it/s, training_loss=0.077]
Epoch 1:  27%|██▋       | 10986/41242 [35:42<1:35:58,  5.25it/s, training_loss=0.305]
Epoch 1:  27%|██▋       | 10987/41242 [35:42<1:35:51,  5.26it/s, training_loss=0.305]
Epoch 1:  27%|██▋       | 10987/41242 [35:43<1:35:51,  5.26it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 10988/41242 [35:43<1:35:47,  5.26it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 10988/41242 [35:43<1:35:47,  5.26it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 10989/41242 [35:43<1:35:26,  5.28it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 10989/41242 [35:43<1:35:26,  5.28it/s, training_loss=0.170]
Epoch 1:  27%|██▋       | 10990/41242 [35:43<1:38:46,  5.10it/s, training_loss=0.170]
Epoch 1:  27%|██▋       | 10990/41242 [35:43<1:38:46,  5.10it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 10991/41242 [35:43<1:38:55,  5.10it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 10991/41242 [35:43<1:38:55,  5.10it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 10992/41242 [35:43<1:39:02,  5.09it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 10992/41242 [35:44<1:39:02,  5.09it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10993/41242 [35:44<1:39:20,  5.08it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 10993/41242 [35:44<1:39:20,  5.08it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10994/41242 [35:44<1:39:48,  5.05it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 10994/41242 [35:44<1:39:48,  5.05it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10995/41242 [35:44<1:38:13,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 10995/41242 [35:44<1:38:13,  5.13it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10996/41242 [35:44<1:38:26,  5.12it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10996/41242 [35:44<1:38:26,  5.12it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10997/41242 [35:44<1:37:52,  5.15it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 10997/41242 [35:44<1:37:52,  5.15it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10998/41242 [35:44<1:37:25,  5.17it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 10998/41242 [35:45<1:37:25,  5.17it/s, training_loss=0.708]
Epoch 1:  27%|██▋       | 10999/41242 [35:45<1:38:30,  5.12it/s, training_loss=0.708]
Epoch 1:  27%|██▋       | 10999/41242 [35:45<1:38:30,  5.12it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11000/41242 [35:45<1:37:04,  5.19it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11000/41242 [35:45<1:37:04,  5.19it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11001/41242 [35:45<1:35:56,  5.25it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11001/41242 [35:45<1:35:56,  5.25it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11002/41242 [35:45<1:35:55,  5.25it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11002/41242 [35:45<1:35:55,  5.25it/s, training_loss=0.058]
Epoch 1:  27%|██▋       | 11003/41242 [35:45<1:38:58,  5.09it/s, training_loss=0.058]
Epoch 1:  27%|██▋       | 11003/41242 [35:46<1:38:58,  5.09it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11004/41242 [35:46<1:36:52,  5.20it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11004/41242 [35:46<1:36:52,  5.20it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11005/41242 [35:46<1:37:18,  5.18it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11005/41242 [35:46<1:37:18,  5.18it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11006/41242 [35:46<1:36:04,  5.25it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11006/41242 [35:46<1:36:04,  5.25it/s, training_loss=0.057]
Epoch 1:  27%|██▋       | 11007/41242 [35:46<1:36:17,  5.23it/s, training_loss=0.057]
Epoch 1:  27%|██▋       | 11007/41242 [35:46<1:36:17,  5.23it/s, training_loss=0.211]
Epoch 1:  27%|██▋       | 11008/41242 [35:46<1:36:01,  5.25it/s, training_loss=0.211]
Epoch 1:  27%|██▋       | 11008/41242 [35:47<1:36:01,  5.25it/s, training_loss=0.051]
Epoch 1:  27%|██▋       | 11009/41242 [35:47<1:36:03,  5.25it/s, training_loss=0.051]
Epoch 1:  27%|██▋       | 11009/41242 [35:47<1:36:03,  5.25it/s, training_loss=0.321]
Epoch 1:  27%|██▋       | 11010/41242 [35:47<1:35:53,  5.25it/s, training_loss=0.321]
Epoch 1:  27%|██▋       | 11010/41242 [35:47<1:35:53,  5.25it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11011/41242 [35:47<1:37:01,  5.19it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11011/41242 [35:47<1:37:01,  5.19it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11012/41242 [35:47<1:36:55,  5.20it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11012/41242 [35:47<1:36:55,  5.20it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11013/41242 [35:47<1:36:53,  5.20it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11013/41242 [35:48<1:36:53,  5.20it/s, training_loss=0.052]
Epoch 1:  27%|██▋       | 11014/41242 [35:48<1:38:32,  5.11it/s, training_loss=0.052]
Epoch 1:  27%|██▋       | 11014/41242 [35:48<1:38:32,  5.11it/s, training_loss=0.147]
Epoch 1:  27%|██▋       | 11015/41242 [35:48<1:38:05,  5.14it/s, training_loss=0.147]
Epoch 1:  27%|██▋       | 11015/41242 [35:48<1:38:05,  5.14it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11016/41242 [35:48<1:40:08,  5.03it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11016/41242 [35:48<1:40:08,  5.03it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11017/41242 [35:48<1:41:07,  4.98it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11017/41242 [35:48<1:41:07,  4.98it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11018/41242 [35:48<1:39:20,  5.07it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11018/41242 [35:49<1:39:20,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11019/41242 [35:49<1:38:56,  5.09it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11019/41242 [35:49<1:38:56,  5.09it/s, training_loss=0.018]
Epoch 1:  27%|██▋       | 11020/41242 [35:49<1:38:11,  5.13it/s, training_loss=0.018]
Epoch 1:  27%|██▋       | 11020/41242 [35:49<1:38:11,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11021/41242 [35:49<1:37:32,  5.16it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11021/41242 [35:49<1:37:32,  5.16it/s, training_loss=0.316]
Epoch 1:  27%|██▋       | 11022/41242 [35:49<1:37:35,  5.16it/s, training_loss=0.316]
Epoch 1:  27%|██▋       | 11022/41242 [35:49<1:37:35,  5.16it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11023/41242 [35:49<1:38:29,  5.11it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11023/41242 [35:50<1:38:29,  5.11it/s, training_loss=0.089]
Epoch 1:  27%|██▋       | 11024/41242 [35:50<1:38:09,  5.13it/s, training_loss=0.089]
Epoch 1:  27%|██▋       | 11024/41242 [35:50<1:38:09,  5.13it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11025/41242 [35:50<1:36:29,  5.22it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11025/41242 [35:50<1:36:29,  5.22it/s, training_loss=0.028]
Epoch 1:  27%|██▋       | 11026/41242 [35:50<1:37:57,  5.14it/s, training_loss=0.028]
Epoch 1:  27%|██▋       | 11026/41242 [35:50<1:37:57,  5.14it/s, training_loss=0.037]
Epoch 1:  27%|██▋       | 11027/41242 [35:50<1:39:29,  5.06it/s, training_loss=0.037]
Epoch 1:  27%|██▋       | 11027/41242 [35:50<1:39:29,  5.06it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11028/41242 [35:50<1:40:12,  5.03it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11028/41242 [35:51<1:40:12,  5.03it/s, training_loss=0.322]
Epoch 1:  27%|██▋       | 11029/41242 [35:51<1:41:28,  4.96it/s, training_loss=0.322]
Epoch 1:  27%|██▋       | 11029/41242 [35:51<1:41:28,  4.96it/s, training_loss=0.045]
Epoch 1:  27%|██▋       | 11030/41242 [35:51<1:42:14,  4.92it/s, training_loss=0.045]
Epoch 1:  27%|██▋       | 11030/41242 [35:51<1:42:14,  4.92it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11031/41242 [35:51<1:41:34,  4.96it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11031/41242 [35:51<1:41:34,  4.96it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11032/41242 [35:51<1:43:52,  4.85it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11032/41242 [35:51<1:43:52,  4.85it/s, training_loss=0.032]
Epoch 1:  27%|██▋       | 11033/41242 [35:51<1:44:05,  4.84it/s, training_loss=0.032]
Epoch 1:  27%|██▋       | 11033/41242 [35:52<1:44:05,  4.84it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11034/41242 [35:52<1:42:15,  4.92it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11034/41242 [35:52<1:42:15,  4.92it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11035/41242 [35:52<1:40:59,  4.98it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11035/41242 [35:52<1:40:59,  4.98it/s, training_loss=0.119]
Epoch 1:  27%|██▋       | 11036/41242 [35:52<1:39:37,  5.05it/s, training_loss=0.119]
Epoch 1:  27%|██▋       | 11036/41242 [35:52<1:39:37,  5.05it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11037/41242 [35:52<1:39:14,  5.07it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11037/41242 [35:52<1:39:14,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11038/41242 [35:52<1:37:29,  5.16it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11038/41242 [35:52<1:37:29,  5.16it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11039/41242 [35:52<1:36:06,  5.24it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11039/41242 [35:53<1:36:06,  5.24it/s, training_loss=0.385]
Epoch 1:  27%|██▋       | 11040/41242 [35:53<1:36:00,  5.24it/s, training_loss=0.385]
Epoch 1:  27%|██▋       | 11040/41242 [35:53<1:36:00,  5.24it/s, training_loss=0.288]
Epoch 1:  27%|██▋       | 11041/41242 [35:53<1:35:39,  5.26it/s, training_loss=0.288]
Epoch 1:  27%|██▋       | 11041/41242 [35:53<1:35:39,  5.26it/s, training_loss=1.001]
Epoch 1:  27%|██▋       | 11042/41242 [35:53<1:35:52,  5.25it/s, training_loss=1.001]
Epoch 1:  27%|██▋       | 11042/41242 [35:53<1:35:52,  5.25it/s, training_loss=0.029]
Epoch 1:  27%|██▋       | 11043/41242 [35:53<1:36:16,  5.23it/s, training_loss=0.029]
Epoch 1:  27%|██▋       | 11043/41242 [35:53<1:36:16,  5.23it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11044/41242 [35:53<1:36:02,  5.24it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11044/41242 [35:54<1:36:02,  5.24it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11045/41242 [35:54<1:35:47,  5.25it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11045/41242 [35:54<1:35:47,  5.25it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11046/41242 [35:54<1:36:03,  5.24it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11046/41242 [35:54<1:36:03,  5.24it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11047/41242 [35:54<1:35:13,  5.28it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11047/41242 [35:54<1:35:13,  5.28it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11048/41242 [35:54<1:34:19,  5.34it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11048/41242 [35:54<1:34:19,  5.34it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11049/41242 [35:54<1:36:13,  5.23it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11049/41242 [35:55<1:36:13,  5.23it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11050/41242 [35:55<1:37:56,  5.14it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11050/41242 [35:55<1:37:56,  5.14it/s, training_loss=0.156]
Epoch 1:  27%|██▋       | 11051/41242 [35:55<1:39:32,  5.05it/s, training_loss=0.156]
Epoch 1:  27%|██▋       | 11051/41242 [35:55<1:39:32,  5.05it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11052/41242 [35:55<1:39:19,  5.07it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11052/41242 [35:55<1:39:19,  5.07it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11053/41242 [35:55<1:40:20,  5.01it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11053/41242 [35:55<1:40:20,  5.01it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11054/41242 [35:55<1:40:45,  4.99it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11054/41242 [35:56<1:40:45,  4.99it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11055/41242 [35:56<1:40:58,  4.98it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11055/41242 [35:56<1:40:58,  4.98it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11056/41242 [35:56<1:40:28,  5.01it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11056/41242 [35:56<1:40:28,  5.01it/s, training_loss=0.126]
Epoch 1:  27%|██▋       | 11057/41242 [35:56<1:40:09,  5.02it/s, training_loss=0.126]
Epoch 1:  27%|██▋       | 11057/41242 [35:56<1:40:09,  5.02it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11058/41242 [35:56<1:40:21,  5.01it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11058/41242 [35:56<1:40:21,  5.01it/s, training_loss=0.200]
Epoch 1:  27%|██▋       | 11059/41242 [35:56<1:40:49,  4.99it/s, training_loss=0.200]
Epoch 1:  27%|██▋       | 11059/41242 [35:57<1:40:49,  4.99it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11060/41242 [35:57<1:41:18,  4.97it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11060/41242 [35:57<1:41:18,  4.97it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11061/41242 [35:57<1:41:30,  4.96it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11061/41242 [35:57<1:41:30,  4.96it/s, training_loss=0.206]
Epoch 1:  27%|██▋       | 11062/41242 [35:57<1:40:59,  4.98it/s, training_loss=0.206]
Epoch 1:  27%|██▋       | 11062/41242 [35:57<1:40:59,  4.98it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11063/41242 [35:57<1:41:51,  4.94it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11063/41242 [35:57<1:41:51,  4.94it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11064/41242 [35:57<1:40:56,  4.98it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11064/41242 [35:58<1:40:56,  4.98it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11065/41242 [35:58<1:41:26,  4.96it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11065/41242 [35:58<1:41:26,  4.96it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11066/41242 [35:58<1:40:03,  5.03it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11066/41242 [35:58<1:40:03,  5.03it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11067/41242 [35:58<1:40:29,  5.00it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11067/41242 [35:58<1:40:29,  5.00it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11068/41242 [35:58<1:41:26,  4.96it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11068/41242 [35:58<1:41:26,  4.96it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11069/41242 [35:58<1:42:20,  4.91it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11069/41242 [35:59<1:42:20,  4.91it/s, training_loss=0.047]
Epoch 1:  27%|██▋       | 11070/41242 [35:59<1:40:25,  5.01it/s, training_loss=0.047]
Epoch 1:  27%|██▋       | 11070/41242 [35:59<1:40:25,  5.01it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11071/41242 [35:59<1:40:00,  5.03it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11071/41242 [35:59<1:40:00,  5.03it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11072/41242 [35:59<1:40:27,  5.01it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11072/41242 [35:59<1:40:27,  5.01it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11073/41242 [35:59<1:41:50,  4.94it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11073/41242 [35:59<1:41:50,  4.94it/s, training_loss=0.211]
Epoch 1:  27%|██▋       | 11074/41242 [35:59<1:42:32,  4.90it/s, training_loss=0.211]
Epoch 1:  27%|██▋       | 11074/41242 [36:00<1:42:32,  4.90it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11075/41242 [36:00<1:41:00,  4.98it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11075/41242 [36:00<1:41:00,  4.98it/s, training_loss=0.468]
Epoch 1:  27%|██▋       | 11076/41242 [36:00<1:38:50,  5.09it/s, training_loss=0.468]
Epoch 1:  27%|██▋       | 11076/41242 [36:00<1:38:50,  5.09it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11077/41242 [36:00<1:37:10,  5.17it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11077/41242 [36:00<1:37:10,  5.17it/s, training_loss=0.069]
Epoch 1:  27%|██▋       | 11078/41242 [36:00<1:36:26,  5.21it/s, training_loss=0.069]
Epoch 1:  27%|██▋       | 11078/41242 [36:00<1:36:26,  5.21it/s, training_loss=0.240]
Epoch 1:  27%|██▋       | 11079/41242 [36:00<1:37:08,  5.18it/s, training_loss=0.240]
Epoch 1:  27%|██▋       | 11079/41242 [36:01<1:37:08,  5.18it/s, training_loss=0.186]
Epoch 1:  27%|██▋       | 11080/41242 [36:01<1:40:15,  5.01it/s, training_loss=0.186]
Epoch 1:  27%|██▋       | 11080/41242 [36:01<1:40:15,  5.01it/s, training_loss=0.519]
Epoch 1:  27%|██▋       | 11081/41242 [36:01<1:41:16,  4.96it/s, training_loss=0.519]
Epoch 1:  27%|██▋       | 11081/41242 [36:01<1:41:16,  4.96it/s, training_loss=0.097]
Epoch 1:  27%|██▋       | 11082/41242 [36:01<1:39:03,  5.07it/s, training_loss=0.097]
Epoch 1:  27%|██▋       | 11082/41242 [36:01<1:39:03,  5.07it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11083/41242 [36:01<1:38:02,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11083/41242 [36:01<1:38:02,  5.13it/s, training_loss=0.371]
Epoch 1:  27%|██▋       | 11084/41242 [36:01<1:37:15,  5.17it/s, training_loss=0.371]
Epoch 1:  27%|██▋       | 11084/41242 [36:02<1:37:15,  5.17it/s, training_loss=0.168]
Epoch 1:  27%|██▋       | 11085/41242 [36:02<1:37:30,  5.15it/s, training_loss=0.168]
Epoch 1:  27%|██▋       | 11085/41242 [36:02<1:37:30,  5.15it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11086/41242 [36:02<1:37:59,  5.13it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11086/41242 [36:02<1:37:59,  5.13it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11087/41242 [36:02<1:39:12,  5.07it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11087/41242 [36:02<1:39:12,  5.07it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11088/41242 [36:02<1:40:00,  5.03it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11088/41242 [36:02<1:40:00,  5.03it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11089/41242 [36:02<1:39:24,  5.06it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11089/41242 [36:03<1:39:24,  5.06it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11090/41242 [36:03<1:39:07,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11090/41242 [36:03<1:39:07,  5.07it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11091/41242 [36:03<1:38:02,  5.13it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11091/41242 [36:03<1:38:02,  5.13it/s, training_loss=0.815]
Epoch 1:  27%|██▋       | 11092/41242 [36:03<1:38:29,  5.10it/s, training_loss=0.815]
Epoch 1:  27%|██▋       | 11092/41242 [36:03<1:38:29,  5.10it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11093/41242 [36:03<1:37:57,  5.13it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11093/41242 [36:03<1:37:57,  5.13it/s, training_loss=0.055]
Epoch 1:  27%|██▋       | 11094/41242 [36:03<1:38:25,  5.11it/s, training_loss=0.055]
Epoch 1:  27%|██▋       | 11094/41242 [36:04<1:38:25,  5.11it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11095/41242 [36:04<1:38:37,  5.09it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11095/41242 [36:04<1:38:37,  5.09it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11096/41242 [36:04<1:38:52,  5.08it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11096/41242 [36:04<1:38:52,  5.08it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11097/41242 [36:04<1:38:35,  5.10it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11097/41242 [36:04<1:38:35,  5.10it/s, training_loss=0.239]
Epoch 1:  27%|██▋       | 11098/41242 [36:04<1:40:23,  5.00it/s, training_loss=0.239]
Epoch 1:  27%|██▋       | 11098/41242 [36:04<1:40:23,  5.00it/s, training_loss=0.310]
Epoch 1:  27%|██▋       | 11099/41242 [36:04<1:40:06,  5.02it/s, training_loss=0.310]
Epoch 1:  27%|██▋       | 11099/41242 [36:05<1:40:06,  5.02it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11100/41242 [36:05<1:38:29,  5.10it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11100/41242 [36:05<1:38:29,  5.10it/s, training_loss=0.209]
Epoch 1:  27%|██▋       | 11101/41242 [36:05<1:41:20,  4.96it/s, training_loss=0.209]
Epoch 1:  27%|██▋       | 11101/41242 [36:05<1:41:20,  4.96it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11102/41242 [36:05<1:40:43,  4.99it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11102/41242 [36:05<1:40:43,  4.99it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11103/41242 [36:05<1:39:27,  5.05it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11103/41242 [36:05<1:39:27,  5.05it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11104/41242 [36:05<1:37:44,  5.14it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11104/41242 [36:06<1:37:44,  5.14it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11105/41242 [36:06<1:38:14,  5.11it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11105/41242 [36:06<1:38:14,  5.11it/s, training_loss=0.231]
Epoch 1:  27%|██▋       | 11106/41242 [36:06<1:37:53,  5.13it/s, training_loss=0.231]
Epoch 1:  27%|██▋       | 11106/41242 [36:06<1:37:53,  5.13it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11107/41242 [36:06<1:38:12,  5.11it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11107/41242 [36:06<1:38:12,  5.11it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11108/41242 [36:06<1:37:27,  5.15it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11108/41242 [36:06<1:37:27,  5.15it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11109/41242 [36:06<1:39:27,  5.05it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11109/41242 [36:06<1:39:27,  5.05it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11110/41242 [36:07<1:39:15,  5.06it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11110/41242 [36:07<1:39:15,  5.06it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11111/41242 [36:07<1:37:31,  5.15it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11111/41242 [36:07<1:37:31,  5.15it/s, training_loss=0.157]
Epoch 1:  27%|██▋       | 11112/41242 [36:07<1:37:04,  5.17it/s, training_loss=0.157]
Epoch 1:  27%|██▋       | 11112/41242 [36:07<1:37:04,  5.17it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11113/41242 [36:07<1:37:25,  5.15it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11113/41242 [36:07<1:37:25,  5.15it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11114/41242 [36:07<1:37:43,  5.14it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11114/41242 [36:07<1:37:43,  5.14it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11115/41242 [36:07<1:37:33,  5.15it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11115/41242 [36:08<1:37:33,  5.15it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 11116/41242 [36:08<1:38:22,  5.10it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 11116/41242 [36:08<1:38:22,  5.10it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11117/41242 [36:08<1:36:28,  5.20it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11117/41242 [36:08<1:36:28,  5.20it/s, training_loss=0.125]
Epoch 1:  27%|██▋       | 11118/41242 [36:08<1:36:12,  5.22it/s, training_loss=0.125]
Epoch 1:  27%|██▋       | 11118/41242 [36:08<1:36:12,  5.22it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11119/41242 [36:08<1:35:59,  5.23it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11119/41242 [36:08<1:35:59,  5.23it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11120/41242 [36:08<1:35:55,  5.23it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11120/41242 [36:09<1:35:55,  5.23it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11121/41242 [36:09<1:36:59,  5.18it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11121/41242 [36:09<1:36:59,  5.18it/s, training_loss=0.642]
Epoch 1:  27%|██▋       | 11122/41242 [36:09<1:39:10,  5.06it/s, training_loss=0.642]
Epoch 1:  27%|██▋       | 11122/41242 [36:09<1:39:10,  5.06it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11123/41242 [36:09<1:40:02,  5.02it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11123/41242 [36:09<1:40:02,  5.02it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11124/41242 [36:09<1:40:26,  5.00it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11124/41242 [36:09<1:40:26,  5.00it/s, training_loss=0.169]
Epoch 1:  27%|██▋       | 11125/41242 [36:09<1:40:11,  5.01it/s, training_loss=0.169]
Epoch 1:  27%|██▋       | 11125/41242 [36:10<1:40:11,  5.01it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11126/41242 [36:10<1:40:58,  4.97it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11126/41242 [36:10<1:40:58,  4.97it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11127/41242 [36:10<1:41:24,  4.95it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11127/41242 [36:10<1:41:24,  4.95it/s, training_loss=0.045]
Epoch 1:  27%|██▋       | 11128/41242 [36:10<1:40:14,  5.01it/s, training_loss=0.045]
Epoch 1:  27%|██▋       | 11128/41242 [36:10<1:40:14,  5.01it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11129/41242 [36:10<1:40:05,  5.01it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11129/41242 [36:10<1:40:05,  5.01it/s, training_loss=0.903]
Epoch 1:  27%|██▋       | 11130/41242 [36:10<1:40:03,  5.02it/s, training_loss=0.903]
Epoch 1:  27%|██▋       | 11130/41242 [36:11<1:40:03,  5.02it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 11131/41242 [36:11<1:40:14,  5.01it/s, training_loss=0.024]
Epoch 1:  27%|██▋       | 11131/41242 [36:11<1:40:14,  5.01it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11132/41242 [36:11<1:38:17,  5.11it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11132/41242 [36:11<1:38:17,  5.11it/s, training_loss=0.331]
Epoch 1:  27%|██▋       | 11133/41242 [36:11<1:40:50,  4.98it/s, training_loss=0.331]
Epoch 1:  27%|██▋       | 11133/41242 [36:11<1:40:50,  4.98it/s, training_loss=0.027]
Epoch 1:  27%|██▋       | 11134/41242 [36:11<1:40:19,  5.00it/s, training_loss=0.027]
Epoch 1:  27%|██▋       | 11134/41242 [36:11<1:40:19,  5.00it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11135/41242 [36:11<1:38:36,  5.09it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11135/41242 [36:12<1:38:36,  5.09it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11136/41242 [36:12<1:38:31,  5.09it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11136/41242 [36:12<1:38:31,  5.09it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11137/41242 [36:12<1:37:22,  5.15it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11137/41242 [36:12<1:37:22,  5.15it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11138/41242 [36:12<1:38:23,  5.10it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11138/41242 [36:12<1:38:23,  5.10it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11139/41242 [36:12<1:38:33,  5.09it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11139/41242 [36:12<1:38:33,  5.09it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11140/41242 [36:12<1:40:31,  4.99it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11140/41242 [36:13<1:40:31,  4.99it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11141/41242 [36:13<1:40:13,  5.01it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11141/41242 [36:13<1:40:13,  5.01it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11142/41242 [36:13<1:40:04,  5.01it/s, training_loss=0.019]
Epoch 1:  27%|██▋       | 11142/41242 [36:13<1:40:04,  5.01it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11143/41242 [36:13<1:37:42,  5.13it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11143/41242 [36:13<1:37:42,  5.13it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11144/41242 [36:13<1:36:21,  5.21it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11144/41242 [36:13<1:36:21,  5.21it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11145/41242 [36:13<1:35:59,  5.23it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11145/41242 [36:14<1:35:59,  5.23it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11146/41242 [36:14<1:36:10,  5.22it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11146/41242 [36:14<1:36:10,  5.22it/s, training_loss=0.233]
Epoch 1:  27%|██▋       | 11147/41242 [36:14<1:35:40,  5.24it/s, training_loss=0.233]
Epoch 1:  27%|██▋       | 11147/41242 [36:14<1:35:40,  5.24it/s, training_loss=0.221]
Epoch 1:  27%|██▋       | 11148/41242 [36:14<1:35:20,  5.26it/s, training_loss=0.221]
Epoch 1:  27%|██▋       | 11148/41242 [36:14<1:35:20,  5.26it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11149/41242 [36:14<1:35:02,  5.28it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11149/41242 [36:14<1:35:02,  5.28it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11150/41242 [36:14<1:34:55,  5.28it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11150/41242 [36:15<1:34:55,  5.28it/s, training_loss=0.102]
Epoch 1:  27%|██▋       | 11151/41242 [36:15<1:35:26,  5.25it/s, training_loss=0.102]
Epoch 1:  27%|██▋       | 11151/41242 [36:15<1:35:26,  5.25it/s, training_loss=0.166]
Epoch 1:  27%|██▋       | 11152/41242 [36:15<1:35:47,  5.24it/s, training_loss=0.166]
Epoch 1:  27%|██▋       | 11152/41242 [36:15<1:35:47,  5.24it/s, training_loss=0.818]
Epoch 1:  27%|██▋       | 11153/41242 [36:15<1:36:30,  5.20it/s, training_loss=0.818]
Epoch 1:  27%|██▋       | 11153/41242 [36:15<1:36:30,  5.20it/s, training_loss=0.041]
Epoch 1:  27%|██▋       | 11154/41242 [36:15<1:36:06,  5.22it/s, training_loss=0.041]
Epoch 1:  27%|██▋       | 11154/41242 [36:15<1:36:06,  5.22it/s, training_loss=0.107]
Epoch 1:  27%|██▋       | 11155/41242 [36:15<1:36:00,  5.22it/s, training_loss=0.107]
Epoch 1:  27%|██▋       | 11155/41242 [36:15<1:36:00,  5.22it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11156/41242 [36:15<1:34:58,  5.28it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11156/41242 [36:16<1:34:58,  5.28it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11157/41242 [36:16<1:34:22,  5.31it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11157/41242 [36:16<1:34:22,  5.31it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11158/41242 [36:16<1:33:40,  5.35it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11158/41242 [36:16<1:33:40,  5.35it/s, training_loss=0.085]
Epoch 1:  27%|██▋       | 11159/41242 [36:16<1:33:49,  5.34it/s, training_loss=0.085]
Epoch 1:  27%|██▋       | 11159/41242 [36:16<1:33:49,  5.34it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11160/41242 [36:16<1:34:47,  5.29it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11160/41242 [36:16<1:34:47,  5.29it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11161/41242 [36:16<1:33:27,  5.36it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11161/41242 [36:17<1:33:27,  5.36it/s, training_loss=0.364]
Epoch 1:  27%|██▋       | 11162/41242 [36:17<1:34:04,  5.33it/s, training_loss=0.364]
Epoch 1:  27%|██▋       | 11162/41242 [36:17<1:34:04,  5.33it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11163/41242 [36:17<1:34:56,  5.28it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11163/41242 [36:17<1:34:56,  5.28it/s, training_loss=0.090]
Epoch 1:  27%|██▋       | 11164/41242 [36:17<1:34:55,  5.28it/s, training_loss=0.090]
Epoch 1:  27%|██▋       | 11164/41242 [36:17<1:34:55,  5.28it/s, training_loss=0.111]
Epoch 1:  27%|██▋       | 11165/41242 [36:17<1:34:51,  5.28it/s, training_loss=0.111]
Epoch 1:  27%|██▋       | 11165/41242 [36:17<1:34:51,  5.28it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11166/41242 [36:17<1:36:29,  5.19it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11166/41242 [36:18<1:36:29,  5.19it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11167/41242 [36:18<1:36:19,  5.20it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11167/41242 [36:18<1:36:19,  5.20it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11168/41242 [36:18<1:35:23,  5.25it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11168/41242 [36:18<1:35:23,  5.25it/s, training_loss=0.071]
Epoch 1:  27%|██▋       | 11169/41242 [36:18<1:37:46,  5.13it/s, training_loss=0.071]
Epoch 1:  27%|██▋       | 11169/41242 [36:18<1:37:46,  5.13it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11170/41242 [36:18<1:38:36,  5.08it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11170/41242 [36:18<1:38:36,  5.08it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11171/41242 [36:18<1:37:32,  5.14it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11171/41242 [36:19<1:37:32,  5.14it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11172/41242 [36:19<1:37:58,  5.11it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11172/41242 [36:19<1:37:58,  5.11it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11173/41242 [36:19<1:39:29,  5.04it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11173/41242 [36:19<1:39:29,  5.04it/s, training_loss=0.099]
Epoch 1:  27%|██▋       | 11174/41242 [36:19<1:39:52,  5.02it/s, training_loss=0.099]
Epoch 1:  27%|██▋       | 11174/41242 [36:19<1:39:52,  5.02it/s, training_loss=0.066]
Epoch 1:  27%|██▋       | 11175/41242 [36:19<1:39:14,  5.05it/s, training_loss=0.066]
Epoch 1:  27%|██▋       | 11175/41242 [36:19<1:39:14,  5.05it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11176/41242 [36:19<1:40:21,  4.99it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11176/41242 [36:20<1:40:21,  4.99it/s, training_loss=0.101]
Epoch 1:  27%|██▋       | 11177/41242 [36:20<1:38:38,  5.08it/s, training_loss=0.101]
Epoch 1:  27%|██▋       | 11177/41242 [36:20<1:38:38,  5.08it/s, training_loss=0.243]
Epoch 1:  27%|██▋       | 11178/41242 [36:20<1:38:02,  5.11it/s, training_loss=0.243]
Epoch 1:  27%|██▋       | 11178/41242 [36:20<1:38:02,  5.11it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11179/41242 [36:20<1:37:21,  5.15it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11179/41242 [36:20<1:37:21,  5.15it/s, training_loss=0.409]
Epoch 1:  27%|██▋       | 11180/41242 [36:20<1:37:29,  5.14it/s, training_loss=0.409]
Epoch 1:  27%|██▋       | 11180/41242 [36:20<1:37:29,  5.14it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11181/41242 [36:20<1:36:13,  5.21it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11181/41242 [36:20<1:36:13,  5.21it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11182/41242 [36:20<1:34:49,  5.28it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11182/41242 [36:21<1:34:49,  5.28it/s, training_loss=0.372]
Epoch 1:  27%|██▋       | 11183/41242 [36:21<1:34:06,  5.32it/s, training_loss=0.372]
Epoch 1:  27%|██▋       | 11183/41242 [36:21<1:34:06,  5.32it/s, training_loss=0.397]
Epoch 1:  27%|██▋       | 11184/41242 [36:21<1:34:19,  5.31it/s, training_loss=0.397]
Epoch 1:  27%|██▋       | 11184/41242 [36:21<1:34:19,  5.31it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11185/41242 [36:21<1:34:21,  5.31it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11185/41242 [36:21<1:34:21,  5.31it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11186/41242 [36:21<1:35:00,  5.27it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11186/41242 [36:21<1:35:00,  5.27it/s, training_loss=0.112]
Epoch 1:  27%|██▋       | 11187/41242 [36:21<1:37:46,  5.12it/s, training_loss=0.112]
Epoch 1:  27%|██▋       | 11187/41242 [36:22<1:37:46,  5.12it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11188/41242 [36:22<1:37:13,  5.15it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11188/41242 [36:22<1:37:13,  5.15it/s, training_loss=0.052]
Epoch 1:  27%|██▋       | 11189/41242 [36:22<1:39:11,  5.05it/s, training_loss=0.052]
Epoch 1:  27%|██▋       | 11189/41242 [36:22<1:39:11,  5.05it/s, training_loss=0.047]
Epoch 1:  27%|██▋       | 11190/41242 [36:22<1:40:17,  4.99it/s, training_loss=0.047]
Epoch 1:  27%|██▋       | 11190/41242 [36:22<1:40:17,  4.99it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11191/41242 [36:22<1:38:43,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11191/41242 [36:22<1:38:43,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11192/41242 [36:22<1:39:40,  5.02it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11192/41242 [36:23<1:39:40,  5.02it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11193/41242 [36:23<1:40:29,  4.98it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11193/41242 [36:23<1:40:29,  4.98it/s, training_loss=0.212]
Epoch 1:  27%|██▋       | 11194/41242 [36:23<1:40:23,  4.99it/s, training_loss=0.212]
Epoch 1:  27%|██▋       | 11194/41242 [36:23<1:40:23,  4.99it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11195/41242 [36:23<1:40:47,  4.97it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11195/41242 [36:23<1:40:47,  4.97it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11196/41242 [36:23<1:42:16,  4.90it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11196/41242 [36:23<1:42:16,  4.90it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11197/41242 [36:23<1:39:56,  5.01it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11197/41242 [36:24<1:39:56,  5.01it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11198/41242 [36:24<1:38:44,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11198/41242 [36:24<1:38:44,  5.07it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11199/41242 [36:24<1:37:44,  5.12it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11199/41242 [36:24<1:37:44,  5.12it/s, training_loss=0.048]
Epoch 1:  27%|██▋       | 11200/41242 [36:24<1:37:09,  5.15it/s, training_loss=0.048]
Epoch 1:  27%|██▋       | 11200/41242 [36:24<1:37:09,  5.15it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11201/41242 [36:24<1:35:39,  5.23it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11201/41242 [36:24<1:35:39,  5.23it/s, training_loss=0.184]
Epoch 1:  27%|██▋       | 11202/41242 [36:24<1:37:31,  5.13it/s, training_loss=0.184]
Epoch 1:  27%|██▋       | 11202/41242 [36:25<1:37:31,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11203/41242 [36:25<1:37:41,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11203/41242 [36:25<1:37:41,  5.13it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11204/41242 [36:25<1:38:43,  5.07it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11204/41242 [36:25<1:38:43,  5.07it/s, training_loss=0.902]
Epoch 1:  27%|██▋       | 11205/41242 [36:25<1:40:06,  5.00it/s, training_loss=0.902]
Epoch 1:  27%|██▋       | 11205/41242 [36:25<1:40:06,  5.00it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11206/41242 [36:25<1:39:14,  5.04it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11206/41242 [36:25<1:39:14,  5.04it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11207/41242 [36:25<1:39:42,  5.02it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11207/41242 [36:26<1:39:42,  5.02it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11208/41242 [36:26<1:39:57,  5.01it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11208/41242 [36:26<1:39:57,  5.01it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11209/41242 [36:26<1:40:02,  5.00it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11209/41242 [36:26<1:40:02,  5.00it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11210/41242 [36:26<1:40:16,  4.99it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11210/41242 [36:26<1:40:16,  4.99it/s, training_loss=0.029]
Epoch 1:  27%|██▋       | 11211/41242 [36:26<1:40:39,  4.97it/s, training_loss=0.029]
Epoch 1:  27%|██▋       | 11211/41242 [36:26<1:40:39,  4.97it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11212/41242 [36:26<1:41:33,  4.93it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11212/41242 [36:27<1:41:33,  4.93it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11213/41242 [36:27<1:38:47,  5.07it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11213/41242 [36:27<1:38:47,  5.07it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11214/41242 [36:27<1:38:41,  5.07it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11214/41242 [36:27<1:38:41,  5.07it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11215/41242 [36:27<1:37:48,  5.12it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11215/41242 [36:27<1:37:48,  5.12it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11216/41242 [36:27<1:36:51,  5.17it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11216/41242 [36:27<1:36:51,  5.17it/s, training_loss=0.067]
Epoch 1:  27%|██▋       | 11217/41242 [36:27<1:36:57,  5.16it/s, training_loss=0.067]
Epoch 1:  27%|██▋       | 11217/41242 [36:28<1:36:57,  5.16it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11218/41242 [36:28<1:38:53,  5.06it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11218/41242 [36:28<1:38:53,  5.06it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11219/41242 [36:28<1:39:30,  5.03it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11219/41242 [36:28<1:39:30,  5.03it/s, training_loss=0.505]
Epoch 1:  27%|██▋       | 11220/41242 [36:28<1:38:50,  5.06it/s, training_loss=0.505]
Epoch 1:  27%|██▋       | 11220/41242 [36:28<1:38:50,  5.06it/s, training_loss=0.279]
Epoch 1:  27%|██▋       | 11221/41242 [36:28<1:37:55,  5.11it/s, training_loss=0.279]
Epoch 1:  27%|██▋       | 11221/41242 [36:28<1:37:55,  5.11it/s, training_loss=0.669]
Epoch 1:  27%|██▋       | 11222/41242 [36:28<1:37:09,  5.15it/s, training_loss=0.669]
Epoch 1:  27%|██▋       | 11222/41242 [36:29<1:37:09,  5.15it/s, training_loss=1.011]
Epoch 1:  27%|██▋       | 11223/41242 [36:29<1:37:32,  5.13it/s, training_loss=1.011]
Epoch 1:  27%|██▋       | 11223/41242 [36:29<1:37:32,  5.13it/s, training_loss=0.028]
Epoch 1:  27%|██▋       | 11224/41242 [36:29<1:37:37,  5.12it/s, training_loss=0.028]
Epoch 1:  27%|██▋       | 11224/41242 [36:29<1:37:37,  5.12it/s, training_loss=0.244]
Epoch 1:  27%|██▋       | 11225/41242 [36:29<1:40:58,  4.95it/s, training_loss=0.244]
Epoch 1:  27%|██▋       | 11225/41242 [36:29<1:40:58,  4.95it/s, training_loss=0.050]
Epoch 1:  27%|██▋       | 11226/41242 [36:29<1:40:31,  4.98it/s, training_loss=0.050]
Epoch 1:  27%|██▋       | 11226/41242 [36:29<1:40:31,  4.98it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11227/41242 [36:29<1:40:21,  4.98it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11227/41242 [36:30<1:40:21,  4.98it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11228/41242 [36:30<1:39:28,  5.03it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11228/41242 [36:30<1:39:28,  5.03it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11229/41242 [36:30<1:37:01,  5.16it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11229/41242 [36:30<1:37:01,  5.16it/s, training_loss=0.032]
Epoch 1:  27%|██▋       | 11230/41242 [36:30<1:36:05,  5.21it/s, training_loss=0.032]
Epoch 1:  27%|██▋       | 11230/41242 [36:30<1:36:05,  5.21it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11231/41242 [36:30<1:36:50,  5.16it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11231/41242 [36:30<1:36:50,  5.16it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11232/41242 [36:30<1:35:38,  5.23it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11232/41242 [36:30<1:35:38,  5.23it/s, training_loss=0.285]
Epoch 1:  27%|██▋       | 11233/41242 [36:31<1:36:38,  5.18it/s, training_loss=0.285]
Epoch 1:  27%|██▋       | 11233/41242 [36:31<1:36:38,  5.18it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11234/41242 [36:31<1:36:39,  5.17it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11234/41242 [36:31<1:36:39,  5.17it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11235/41242 [36:31<1:36:35,  5.18it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11235/41242 [36:31<1:36:35,  5.18it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11236/41242 [36:31<1:37:55,  5.11it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11236/41242 [36:31<1:37:55,  5.11it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11237/41242 [36:31<1:38:04,  5.10it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11237/41242 [36:31<1:38:04,  5.10it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11238/41242 [36:31<1:38:24,  5.08it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11238/41242 [36:32<1:38:24,  5.08it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11239/41242 [36:32<1:37:16,  5.14it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11239/41242 [36:32<1:37:16,  5.14it/s, training_loss=0.702]
Epoch 1:  27%|██▋       | 11240/41242 [36:32<1:37:02,  5.15it/s, training_loss=0.702]
Epoch 1:  27%|██▋       | 11240/41242 [36:32<1:37:02,  5.15it/s, training_loss=0.087]
Epoch 1:  27%|██▋       | 11241/41242 [36:32<1:37:48,  5.11it/s, training_loss=0.087]
Epoch 1:  27%|██▋       | 11241/41242 [36:32<1:37:48,  5.11it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11242/41242 [36:32<1:36:54,  5.16it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11242/41242 [36:32<1:36:54,  5.16it/s, training_loss=0.073]
Epoch 1:  27%|██▋       | 11243/41242 [36:32<1:36:35,  5.18it/s, training_loss=0.073]
Epoch 1:  27%|██▋       | 11243/41242 [36:33<1:36:35,  5.18it/s, training_loss=0.038]
Epoch 1:  27%|██▋       | 11244/41242 [36:33<1:36:16,  5.19it/s, training_loss=0.038]
Epoch 1:  27%|██▋       | 11244/41242 [36:33<1:36:16,  5.19it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11245/41242 [36:33<1:35:34,  5.23it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11245/41242 [36:33<1:35:34,  5.23it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11246/41242 [36:33<1:35:06,  5.26it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11246/41242 [36:33<1:35:06,  5.26it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11247/41242 [36:33<1:35:13,  5.25it/s, training_loss=0.023]
Epoch 1:  27%|██▋       | 11247/41242 [36:33<1:35:13,  5.25it/s, training_loss=0.679]
Epoch 1:  27%|██▋       | 11248/41242 [36:33<1:36:45,  5.17it/s, training_loss=0.679]
Epoch 1:  27%|██▋       | 11248/41242 [36:34<1:36:45,  5.17it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11249/41242 [36:34<1:38:00,  5.10it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11249/41242 [36:34<1:38:00,  5.10it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11250/41242 [36:34<1:39:21,  5.03it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11250/41242 [36:34<1:39:21,  5.03it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11251/41242 [36:34<1:39:28,  5.03it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11251/41242 [36:34<1:39:28,  5.03it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11252/41242 [36:34<1:39:34,  5.02it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11252/41242 [36:34<1:39:34,  5.02it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11253/41242 [36:34<1:38:24,  5.08it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11253/41242 [36:35<1:38:24,  5.08it/s, training_loss=0.186]
Epoch 1:  27%|██▋       | 11254/41242 [36:35<1:39:20,  5.03it/s, training_loss=0.186]
Epoch 1:  27%|██▋       | 11254/41242 [36:35<1:39:20,  5.03it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11255/41242 [36:35<1:38:17,  5.08it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11255/41242 [36:35<1:38:17,  5.08it/s, training_loss=0.062]
Epoch 1:  27%|██▋       | 11256/41242 [36:35<1:38:59,  5.05it/s, training_loss=0.062]
Epoch 1:  27%|██▋       | 11256/41242 [36:35<1:38:59,  5.05it/s, training_loss=0.568]
Epoch 1:  27%|██▋       | 11257/41242 [36:35<1:39:35,  5.02it/s, training_loss=0.568]
Epoch 1:  27%|██▋       | 11257/41242 [36:35<1:39:35,  5.02it/s, training_loss=0.178]
Epoch 1:  27%|██▋       | 11258/41242 [36:35<1:39:15,  5.03it/s, training_loss=0.178]
Epoch 1:  27%|██▋       | 11258/41242 [36:36<1:39:15,  5.03it/s, training_loss=0.070]
Epoch 1:  27%|██▋       | 11259/41242 [36:36<1:38:58,  5.05it/s, training_loss=0.070]
Epoch 1:  27%|██▋       | 11259/41242 [36:36<1:38:58,  5.05it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11260/41242 [36:36<1:39:27,  5.02it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11260/41242 [36:36<1:39:27,  5.02it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11261/41242 [36:36<1:39:01,  5.05it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11261/41242 [36:36<1:39:01,  5.05it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11262/41242 [36:36<1:38:58,  5.05it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11262/41242 [36:36<1:38:58,  5.05it/s, training_loss=0.001]
Epoch 1:  27%|██▋       | 11263/41242 [36:36<1:39:12,  5.04it/s, training_loss=0.001]
Epoch 1:  27%|██▋       | 11263/41242 [36:37<1:39:12,  5.04it/s, training_loss=0.425]
Epoch 1:  27%|██▋       | 11264/41242 [36:37<1:39:51,  5.00it/s, training_loss=0.425]
Epoch 1:  27%|██▋       | 11264/41242 [36:37<1:39:51,  5.00it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11265/41242 [36:37<1:38:38,  5.06it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11265/41242 [36:37<1:38:38,  5.06it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11266/41242 [36:37<1:36:54,  5.16it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11266/41242 [36:37<1:36:54,  5.16it/s, training_loss=0.174]
Epoch 1:  27%|██▋       | 11267/41242 [36:37<1:37:05,  5.15it/s, training_loss=0.174]
Epoch 1:  27%|██▋       | 11267/41242 [36:37<1:37:05,  5.15it/s, training_loss=0.377]
Epoch 1:  27%|██▋       | 11268/41242 [36:37<1:37:34,  5.12it/s, training_loss=0.377]
Epoch 1:  27%|██▋       | 11268/41242 [36:38<1:37:34,  5.12it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11269/41242 [36:38<1:38:41,  5.06it/s, training_loss=0.009]
Epoch 1:  27%|██▋       | 11269/41242 [36:38<1:38:41,  5.06it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11270/41242 [36:38<1:36:55,  5.15it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11270/41242 [36:38<1:36:55,  5.15it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11271/41242 [36:38<1:36:25,  5.18it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11271/41242 [36:38<1:36:25,  5.18it/s, training_loss=0.981]
Epoch 1:  27%|██▋       | 11272/41242 [36:38<1:36:49,  5.16it/s, training_loss=0.981]
Epoch 1:  27%|██▋       | 11272/41242 [36:38<1:36:49,  5.16it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11273/41242 [36:38<1:35:31,  5.23it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11273/41242 [36:39<1:35:31,  5.23it/s, training_loss=0.053]
Epoch 1:  27%|██▋       | 11274/41242 [36:39<1:35:56,  5.21it/s, training_loss=0.053]
Epoch 1:  27%|██▋       | 11274/41242 [36:39<1:35:56,  5.21it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11275/41242 [36:39<1:35:05,  5.25it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11275/41242 [36:39<1:35:05,  5.25it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11276/41242 [36:39<1:35:11,  5.25it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11276/41242 [36:39<1:35:11,  5.25it/s, training_loss=0.122]
Epoch 1:  27%|██▋       | 11277/41242 [36:39<1:38:45,  5.06it/s, training_loss=0.122]
Epoch 1:  27%|██▋       | 11277/41242 [36:39<1:38:45,  5.06it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11278/41242 [36:39<1:37:09,  5.14it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11278/41242 [36:39<1:37:09,  5.14it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11279/41242 [36:39<1:35:48,  5.21it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11279/41242 [36:40<1:35:48,  5.21it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11280/41242 [36:40<1:35:44,  5.22it/s, training_loss=0.013]
Epoch 1:  27%|██▋       | 11280/41242 [36:40<1:35:44,  5.22it/s, training_loss=0.542]
Epoch 1:  27%|██▋       | 11281/41242 [36:40<1:36:14,  5.19it/s, training_loss=0.542]
Epoch 1:  27%|██▋       | 11281/41242 [36:40<1:36:14,  5.19it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11282/41242 [36:40<1:34:37,  5.28it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11282/41242 [36:40<1:34:37,  5.28it/s, training_loss=0.031]
Epoch 1:  27%|██▋       | 11283/41242 [36:40<1:34:40,  5.27it/s, training_loss=0.031]
Epoch 1:  27%|██▋       | 11283/41242 [36:40<1:34:40,  5.27it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11284/41242 [36:40<1:34:42,  5.27it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11284/41242 [36:41<1:34:42,  5.27it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11285/41242 [36:41<1:35:03,  5.25it/s, training_loss=0.014]
Epoch 1:  27%|██▋       | 11285/41242 [36:41<1:35:03,  5.25it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11286/41242 [36:41<1:36:44,  5.16it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11286/41242 [36:41<1:36:44,  5.16it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11287/41242 [36:41<1:36:43,  5.16it/s, training_loss=0.030]
Epoch 1:  27%|██▋       | 11287/41242 [36:41<1:36:43,  5.16it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11288/41242 [36:41<1:37:25,  5.12it/s, training_loss=0.060]
Epoch 1:  27%|██▋       | 11288/41242 [36:41<1:37:25,  5.12it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11289/41242 [36:41<1:36:16,  5.19it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11289/41242 [36:42<1:36:16,  5.19it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11290/41242 [36:42<1:36:14,  5.19it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11290/41242 [36:42<1:36:14,  5.19it/s, training_loss=0.125]
Epoch 1:  27%|██▋       | 11291/41242 [36:42<1:36:04,  5.20it/s, training_loss=0.125]
Epoch 1:  27%|██▋       | 11291/41242 [36:42<1:36:04,  5.20it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11292/41242 [36:42<1:35:19,  5.24it/s, training_loss=0.008]
Epoch 1:  27%|██▋       | 11292/41242 [36:42<1:35:19,  5.24it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11293/41242 [36:42<1:34:18,  5.29it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11293/41242 [36:42<1:34:18,  5.29it/s, training_loss=0.152]
Epoch 1:  27%|██▋       | 11294/41242 [36:42<1:34:33,  5.28it/s, training_loss=0.152]
Epoch 1:  27%|██▋       | 11294/41242 [36:43<1:34:33,  5.28it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11295/41242 [36:43<1:33:38,  5.33it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11295/41242 [36:43<1:33:38,  5.33it/s, training_loss=0.117]
Epoch 1:  27%|██▋       | 11296/41242 [36:43<1:33:30,  5.34it/s, training_loss=0.117]
Epoch 1:  27%|██▋       | 11296/41242 [36:43<1:33:30,  5.34it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11297/41242 [36:43<1:34:43,  5.27it/s, training_loss=0.022]
Epoch 1:  27%|██▋       | 11297/41242 [36:43<1:34:43,  5.27it/s, training_loss=0.165]
Epoch 1:  27%|██▋       | 11298/41242 [36:43<1:34:58,  5.25it/s, training_loss=0.165]
Epoch 1:  27%|██▋       | 11298/41242 [36:43<1:34:58,  5.25it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11299/41242 [36:43<1:34:34,  5.28it/s, training_loss=0.002]
Epoch 1:  27%|██▋       | 11299/41242 [36:43<1:34:34,  5.28it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11300/41242 [36:43<1:35:04,  5.25it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11300/41242 [36:44<1:35:04,  5.25it/s, training_loss=0.113]
Epoch 1:  27%|██▋       | 11301/41242 [36:44<1:35:16,  5.24it/s, training_loss=0.113]
Epoch 1:  27%|██▋       | 11301/41242 [36:44<1:35:16,  5.24it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11302/41242 [36:44<1:34:29,  5.28it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11302/41242 [36:44<1:34:29,  5.28it/s, training_loss=0.664]
Epoch 1:  27%|██▋       | 11303/41242 [36:44<1:34:58,  5.25it/s, training_loss=0.664]
Epoch 1:  27%|██▋       | 11303/41242 [36:44<1:34:58,  5.25it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11304/41242 [36:44<1:34:28,  5.28it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11304/41242 [36:44<1:34:28,  5.28it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11305/41242 [36:44<1:33:58,  5.31it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11305/41242 [36:45<1:33:58,  5.31it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11306/41242 [36:45<1:33:33,  5.33it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11306/41242 [36:45<1:33:33,  5.33it/s, training_loss=0.038]
Epoch 1:  27%|██▋       | 11307/41242 [36:45<1:33:42,  5.32it/s, training_loss=0.038]
Epoch 1:  27%|██▋       | 11307/41242 [36:45<1:33:42,  5.32it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11308/41242 [36:45<1:34:06,  5.30it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11308/41242 [36:45<1:34:06,  5.30it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11309/41242 [36:45<1:35:03,  5.25it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11309/41242 [36:45<1:35:03,  5.25it/s, training_loss=0.306]
Epoch 1:  27%|██▋       | 11310/41242 [36:45<1:36:41,  5.16it/s, training_loss=0.306]
Epoch 1:  27%|██▋       | 11310/41242 [36:46<1:36:41,  5.16it/s, training_loss=0.593]
Epoch 1:  27%|██▋       | 11311/41242 [36:46<1:37:34,  5.11it/s, training_loss=0.593]
Epoch 1:  27%|██▋       | 11311/41242 [36:46<1:37:34,  5.11it/s, training_loss=0.050]
Epoch 1:  27%|██▋       | 11312/41242 [36:46<1:37:12,  5.13it/s, training_loss=0.050]
Epoch 1:  27%|██▋       | 11312/41242 [36:46<1:37:12,  5.13it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11313/41242 [36:46<1:36:47,  5.15it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11313/41242 [36:46<1:36:47,  5.15it/s, training_loss=0.084]
Epoch 1:  27%|██▋       | 11314/41242 [36:46<1:37:14,  5.13it/s, training_loss=0.084]
Epoch 1:  27%|██▋       | 11314/41242 [36:46<1:37:14,  5.13it/s, training_loss=0.382]
Epoch 1:  27%|██▋       | 11315/41242 [36:46<1:38:20,  5.07it/s, training_loss=0.382]
Epoch 1:  27%|██▋       | 11315/41242 [36:47<1:38:20,  5.07it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11316/41242 [36:47<1:37:59,  5.09it/s, training_loss=0.004]
Epoch 1:  27%|██▋       | 11316/41242 [36:47<1:37:59,  5.09it/s, training_loss=0.470]
Epoch 1:  27%|██▋       | 11317/41242 [36:47<1:37:41,  5.11it/s, training_loss=0.470]
Epoch 1:  27%|██▋       | 11317/41242 [36:47<1:37:41,  5.11it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11318/41242 [36:47<1:37:40,  5.11it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11318/41242 [36:47<1:37:40,  5.11it/s, training_loss=0.456]
Epoch 1:  27%|██▋       | 11319/41242 [36:47<1:37:43,  5.10it/s, training_loss=0.456]
Epoch 1:  27%|██▋       | 11319/41242 [36:47<1:37:43,  5.10it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11320/41242 [36:47<1:38:38,  5.06it/s, training_loss=0.020]
Epoch 1:  27%|██▋       | 11320/41242 [36:48<1:38:38,  5.06it/s, training_loss=0.627]
Epoch 1:  27%|██▋       | 11321/41242 [36:48<1:38:24,  5.07it/s, training_loss=0.627]
Epoch 1:  27%|██▋       | 11321/41242 [36:48<1:38:24,  5.07it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11322/41242 [36:48<1:38:53,  5.04it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11322/41242 [36:48<1:38:53,  5.04it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11323/41242 [36:48<1:39:22,  5.02it/s, training_loss=0.005]
Epoch 1:  27%|██▋       | 11323/41242 [36:48<1:39:22,  5.02it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11324/41242 [36:48<1:39:05,  5.03it/s, training_loss=0.006]
Epoch 1:  27%|██▋       | 11324/41242 [36:48<1:39:05,  5.03it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11325/41242 [36:48<1:38:16,  5.07it/s, training_loss=0.011]
Epoch 1:  27%|██▋       | 11325/41242 [36:49<1:38:16,  5.07it/s, training_loss=0.041]
Epoch 1:  27%|██▋       | 11326/41242 [36:49<1:39:10,  5.03it/s, training_loss=0.041]
Epoch 1:  27%|██▋       | 11326/41242 [36:49<1:39:10,  5.03it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11327/41242 [36:49<1:38:48,  5.05it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11327/41242 [36:49<1:38:48,  5.05it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11328/41242 [36:49<1:39:00,  5.04it/s, training_loss=0.015]
Epoch 1:  27%|██▋       | 11328/41242 [36:49<1:39:00,  5.04it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11329/41242 [36:49<1:40:30,  4.96it/s, training_loss=0.007]
Epoch 1:  27%|██▋       | 11329/41242 [36:49<1:40:30,  4.96it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 11330/41242 [36:49<1:40:33,  4.96it/s, training_loss=0.026]
Epoch 1:  27%|██▋       | 11330/41242 [36:50<1:40:33,  4.96it/s, training_loss=0.042]
Epoch 1:  27%|██▋       | 11331/41242 [36:50<1:38:53,  5.04it/s, training_loss=0.042]
Epoch 1:  27%|██▋       | 11331/41242 [36:50<1:38:53,  5.04it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11332/41242 [36:50<1:38:38,  5.05it/s, training_loss=0.003]
Epoch 1:  27%|██▋       | 11332/41242 [36:50<1:38:38,  5.05it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11333/41242 [36:50<1:39:22,  5.02it/s, training_loss=0.012]
Epoch 1:  27%|██▋       | 11333/41242 [36:50<1:39:22,  5.02it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 11334/41242 [36:50<1:41:11,  4.93it/s, training_loss=0.016]
Epoch 1:  27%|██▋       | 11334/41242 [36:50<1:41:11,  4.93it/s, training_loss=0.491]
Epoch 1:  27%|██▋       | 11335/41242 [36:50<1:39:49,  4.99it/s, training_loss=0.491]
Epoch 1:  27%|██▋       | 11335/41242 [36:51<1:39:49,  4.99it/s, training_loss=0.168]
Epoch 1:  27%|██▋       | 11336/41242 [36:51<1:41:11,  4.93it/s, training_loss=0.168]
Epoch 1:  27%|██▋       | 11336/41242 [36:51<1:41:11,  4.93it/s, training_loss=0.104]
Epoch 1:  27%|██▋       | 11337/41242 [36:51<1:40:56,  4.94it/s, training_loss=0.104]
Epoch 1:  27%|██▋       | 11337/41242 [36:51<1:40:56,  4.94it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11338/41242 [36:51<1:41:08,  4.93it/s, training_loss=0.010]
Epoch 1:  27%|██▋       | 11338/41242 [36:51<1:41:08,  4.93it/s, training_loss=0.065]
Epoch 1:  27%|██▋       | 11339/41242 [36:51<1:40:34,  4.96it/s, training_loss=0.065]
Epoch 1:  27%|██▋       | 11339/41242 [36:51<1:40:34,  4.96it/s, training_loss=0.633]
Epoch 1:  27%|██▋       | 11340/41242 [36:51<1:40:25,  4.96it/s, training_loss=0.633]
Epoch 1:  27%|██▋       | 11340/41242 [36:52<1:40:25,  4.96it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11341/41242 [36:52<1:40:48,  4.94it/s, training_loss=0.017]
Epoch 1:  27%|██▋       | 11341/41242 [36:52<1:40:48,  4.94it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11342/41242 [36:52<1:41:20,  4.92it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11342/41242 [36:52<1:41:20,  4.92it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11343/41242 [36:52<1:39:49,  4.99it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11343/41242 [36:52<1:39:49,  4.99it/s, training_loss=0.359]
Epoch 1:  28%|██▊       | 11344/41242 [36:52<1:38:51,  5.04it/s, training_loss=0.359]
Epoch 1:  28%|██▊       | 11344/41242 [36:52<1:38:51,  5.04it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11345/41242 [36:52<1:38:42,  5.05it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11345/41242 [36:53<1:38:42,  5.05it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11346/41242 [36:53<1:39:10,  5.02it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11346/41242 [36:53<1:39:10,  5.02it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11347/41242 [36:53<1:37:42,  5.10it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11347/41242 [36:53<1:37:42,  5.10it/s, training_loss=0.417]
Epoch 1:  28%|██▊       | 11348/41242 [36:53<1:38:11,  5.07it/s, training_loss=0.417]
Epoch 1:  28%|██▊       | 11348/41242 [36:53<1:38:11,  5.07it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11349/41242 [36:53<1:40:09,  4.97it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11349/41242 [36:53<1:40:09,  4.97it/s, training_loss=0.443]
Epoch 1:  28%|██▊       | 11350/41242 [36:53<1:38:47,  5.04it/s, training_loss=0.443]
Epoch 1:  28%|██▊       | 11350/41242 [36:54<1:38:47,  5.04it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11351/41242 [36:54<1:36:50,  5.14it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11351/41242 [36:54<1:36:50,  5.14it/s, training_loss=0.081]
Epoch 1:  28%|██▊       | 11352/41242 [36:54<1:37:42,  5.10it/s, training_loss=0.081]
Epoch 1:  28%|██▊       | 11352/41242 [36:54<1:37:42,  5.10it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11353/41242 [36:54<1:37:19,  5.12it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11353/41242 [36:54<1:37:19,  5.12it/s, training_loss=0.043]
Epoch 1:  28%|██▊       | 11354/41242 [36:54<1:36:57,  5.14it/s, training_loss=0.043]
Epoch 1:  28%|██▊       | 11354/41242 [36:54<1:36:57,  5.14it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11355/41242 [36:54<1:37:51,  5.09it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11355/41242 [36:55<1:37:51,  5.09it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11356/41242 [36:55<1:38:03,  5.08it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11356/41242 [36:55<1:38:03,  5.08it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11357/41242 [36:55<1:37:09,  5.13it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11357/41242 [36:55<1:37:09,  5.13it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11358/41242 [36:55<1:35:45,  5.20it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11358/41242 [36:55<1:35:45,  5.20it/s, training_loss=0.068]
Epoch 1:  28%|██▊       | 11359/41242 [36:55<1:35:26,  5.22it/s, training_loss=0.068]
Epoch 1:  28%|██▊       | 11359/41242 [36:55<1:35:26,  5.22it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11360/41242 [36:55<1:34:51,  5.25it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11360/41242 [36:55<1:34:51,  5.25it/s, training_loss=0.127]
Epoch 1:  28%|██▊       | 11361/41242 [36:55<1:34:33,  5.27it/s, training_loss=0.127]
Epoch 1:  28%|██▊       | 11361/41242 [36:56<1:34:33,  5.27it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11362/41242 [36:56<1:34:47,  5.25it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11362/41242 [36:56<1:34:47,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11363/41242 [36:56<1:36:29,  5.16it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11363/41242 [36:56<1:36:29,  5.16it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11364/41242 [36:56<1:35:53,  5.19it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11364/41242 [36:56<1:35:53,  5.19it/s, training_loss=0.075]
Epoch 1:  28%|██▊       | 11365/41242 [36:56<1:35:45,  5.20it/s, training_loss=0.075]
Epoch 1:  28%|██▊       | 11365/41242 [36:56<1:35:45,  5.20it/s, training_loss=0.041]
Epoch 1:  28%|██▊       | 11366/41242 [36:56<1:35:54,  5.19it/s, training_loss=0.041]
Epoch 1:  28%|██▊       | 11366/41242 [36:57<1:35:54,  5.19it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11367/41242 [36:57<1:34:30,  5.27it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11367/41242 [36:57<1:34:30,  5.27it/s, training_loss=0.038]
Epoch 1:  28%|██▊       | 11368/41242 [36:57<1:34:12,  5.29it/s, training_loss=0.038]
Epoch 1:  28%|██▊       | 11368/41242 [36:57<1:34:12,  5.29it/s, training_loss=0.244]
Epoch 1:  28%|██▊       | 11369/41242 [36:57<1:36:49,  5.14it/s, training_loss=0.244]
Epoch 1:  28%|██▊       | 11369/41242 [36:57<1:36:49,  5.14it/s, training_loss=0.024]
Epoch 1:  28%|██▊       | 11370/41242 [36:57<1:36:43,  5.15it/s, training_loss=0.024]
Epoch 1:  28%|██▊       | 11370/41242 [36:57<1:36:43,  5.15it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11371/41242 [36:57<1:36:38,  5.15it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11371/41242 [36:58<1:36:38,  5.15it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11372/41242 [36:58<1:35:32,  5.21it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11372/41242 [36:58<1:35:32,  5.21it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11373/41242 [36:58<1:35:23,  5.22it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11373/41242 [36:58<1:35:23,  5.22it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11374/41242 [36:58<1:34:59,  5.24it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11374/41242 [36:58<1:34:59,  5.24it/s, training_loss=0.132]
Epoch 1:  28%|██▊       | 11375/41242 [36:58<1:35:23,  5.22it/s, training_loss=0.132]
Epoch 1:  28%|██▊       | 11375/41242 [36:58<1:35:23,  5.22it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11376/41242 [36:58<1:36:40,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11376/41242 [36:59<1:36:40,  5.15it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11377/41242 [36:59<1:35:44,  5.20it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11377/41242 [36:59<1:35:44,  5.20it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11378/41242 [36:59<1:34:32,  5.26it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11378/41242 [36:59<1:34:32,  5.26it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11379/41242 [36:59<1:33:47,  5.31it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11379/41242 [36:59<1:33:47,  5.31it/s, training_loss=0.168]
Epoch 1:  28%|██▊       | 11380/41242 [36:59<1:34:25,  5.27it/s, training_loss=0.168]
Epoch 1:  28%|██▊       | 11380/41242 [36:59<1:34:25,  5.27it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11381/41242 [36:59<1:34:01,  5.29it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11381/41242 [36:59<1:34:01,  5.29it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11382/41242 [36:59<1:33:22,  5.33it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11382/41242 [37:00<1:33:22,  5.33it/s, training_loss=0.055]
Epoch 1:  28%|██▊       | 11383/41242 [37:00<1:34:03,  5.29it/s, training_loss=0.055]
Epoch 1:  28%|██▊       | 11383/41242 [37:00<1:34:03,  5.29it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11384/41242 [37:00<1:33:32,  5.32it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11384/41242 [37:00<1:33:32,  5.32it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11385/41242 [37:00<1:33:45,  5.31it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11385/41242 [37:00<1:33:45,  5.31it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11386/41242 [37:00<1:34:36,  5.26it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11386/41242 [37:00<1:34:36,  5.26it/s, training_loss=0.131]
Epoch 1:  28%|██▊       | 11387/41242 [37:00<1:34:23,  5.27it/s, training_loss=0.131]
Epoch 1:  28%|██▊       | 11387/41242 [37:01<1:34:23,  5.27it/s, training_loss=0.316]
Epoch 1:  28%|██▊       | 11388/41242 [37:01<1:37:15,  5.12it/s, training_loss=0.316]
Epoch 1:  28%|██▊       | 11388/41242 [37:01<1:37:15,  5.12it/s, training_loss=0.140]
Epoch 1:  28%|██▊       | 11389/41242 [37:01<1:38:02,  5.08it/s, training_loss=0.140]
Epoch 1:  28%|██▊       | 11389/41242 [37:01<1:38:02,  5.08it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11390/41242 [37:01<1:37:53,  5.08it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11390/41242 [37:01<1:37:53,  5.08it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11391/41242 [37:01<1:36:08,  5.18it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11391/41242 [37:01<1:36:08,  5.18it/s, training_loss=0.511]
Epoch 1:  28%|██▊       | 11392/41242 [37:01<1:36:04,  5.18it/s, training_loss=0.511]
Epoch 1:  28%|██▊       | 11392/41242 [37:02<1:36:04,  5.18it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11393/41242 [37:02<1:35:18,  5.22it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11393/41242 [37:02<1:35:18,  5.22it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11394/41242 [37:02<1:35:58,  5.18it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11394/41242 [37:02<1:35:58,  5.18it/s, training_loss=0.196]
Epoch 1:  28%|██▊       | 11395/41242 [37:02<1:37:42,  5.09it/s, training_loss=0.196]
Epoch 1:  28%|██▊       | 11395/41242 [37:02<1:37:42,  5.09it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11396/41242 [37:02<1:36:17,  5.17it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11396/41242 [37:02<1:36:17,  5.17it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11397/41242 [37:02<1:35:14,  5.22it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11397/41242 [37:03<1:35:14,  5.22it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11398/41242 [37:03<1:33:44,  5.31it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11398/41242 [37:03<1:33:44,  5.31it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11399/41242 [37:03<1:33:09,  5.34it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11399/41242 [37:03<1:33:09,  5.34it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11400/41242 [37:03<1:34:28,  5.26it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11400/41242 [37:03<1:34:28,  5.26it/s, training_loss=0.031]
Epoch 1:  28%|██▊       | 11401/41242 [37:03<1:36:56,  5.13it/s, training_loss=0.031]
Epoch 1:  28%|██▊       | 11401/41242 [37:03<1:36:56,  5.13it/s, training_loss=0.185]
Epoch 1:  28%|██▊       | 11402/41242 [37:03<1:37:58,  5.08it/s, training_loss=0.185]
Epoch 1:  28%|██▊       | 11402/41242 [37:04<1:37:58,  5.08it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11403/41242 [37:04<1:37:07,  5.12it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11403/41242 [37:04<1:37:07,  5.12it/s, training_loss=0.123]
Epoch 1:  28%|██▊       | 11404/41242 [37:04<1:36:01,  5.18it/s, training_loss=0.123]
Epoch 1:  28%|██▊       | 11404/41242 [37:04<1:36:01,  5.18it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11405/41242 [37:04<1:36:26,  5.16it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11405/41242 [37:04<1:36:26,  5.16it/s, training_loss=0.017]
Epoch 1:  28%|██▊       | 11406/41242 [37:04<1:36:40,  5.14it/s, training_loss=0.017]
Epoch 1:  28%|██▊       | 11406/41242 [37:04<1:36:40,  5.14it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11407/41242 [37:04<1:37:04,  5.12it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11407/41242 [37:05<1:37:04,  5.12it/s, training_loss=0.322]
Epoch 1:  28%|██▊       | 11408/41242 [37:05<1:39:43,  4.99it/s, training_loss=0.322]
Epoch 1:  28%|██▊       | 11408/41242 [37:05<1:39:43,  4.99it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11409/41242 [37:05<1:38:28,  5.05it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11409/41242 [37:05<1:38:28,  5.05it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11410/41242 [37:05<1:38:07,  5.07it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11410/41242 [37:05<1:38:07,  5.07it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11411/41242 [37:05<1:37:48,  5.08it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11411/41242 [37:05<1:37:48,  5.08it/s, training_loss=0.092]
Epoch 1:  28%|██▊       | 11412/41242 [37:05<1:38:25,  5.05it/s, training_loss=0.092]
Epoch 1:  28%|██▊       | 11412/41242 [37:06<1:38:25,  5.05it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11413/41242 [37:06<1:39:31,  5.00it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11413/41242 [37:06<1:39:31,  5.00it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11414/41242 [37:06<1:39:18,  5.01it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11414/41242 [37:06<1:39:18,  5.01it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11415/41242 [37:06<1:38:48,  5.03it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11415/41242 [37:06<1:38:48,  5.03it/s, training_loss=0.058]
Epoch 1:  28%|██▊       | 11416/41242 [37:06<1:38:27,  5.05it/s, training_loss=0.058]
Epoch 1:  28%|██▊       | 11416/41242 [37:06<1:38:27,  5.05it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11417/41242 [37:06<1:38:30,  5.05it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11417/41242 [37:07<1:38:30,  5.05it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11418/41242 [37:07<1:38:29,  5.05it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11418/41242 [37:07<1:38:29,  5.05it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11419/41242 [37:07<1:37:57,  5.07it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11419/41242 [37:07<1:37:57,  5.07it/s, training_loss=0.753]
Epoch 1:  28%|██▊       | 11420/41242 [37:07<1:39:40,  4.99it/s, training_loss=0.753]
Epoch 1:  28%|██▊       | 11420/41242 [37:07<1:39:40,  4.99it/s, training_loss=0.782]
Epoch 1:  28%|██▊       | 11421/41242 [37:07<1:42:59,  4.83it/s, training_loss=0.782]
Epoch 1:  28%|██▊       | 11421/41242 [37:07<1:42:59,  4.83it/s, training_loss=0.649]
Epoch 1:  28%|██▊       | 11422/41242 [37:07<1:41:04,  4.92it/s, training_loss=0.649]
Epoch 1:  28%|██▊       | 11422/41242 [37:08<1:41:04,  4.92it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11423/41242 [37:08<1:42:07,  4.87it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11423/41242 [37:08<1:42:07,  4.87it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11424/41242 [37:08<1:40:29,  4.95it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11424/41242 [37:08<1:40:29,  4.95it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11425/41242 [37:08<1:39:20,  5.00it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11425/41242 [37:08<1:39:20,  5.00it/s, training_loss=0.035]
Epoch 1:  28%|██▊       | 11426/41242 [37:08<1:38:13,  5.06it/s, training_loss=0.035]
Epoch 1:  28%|██▊       | 11426/41242 [37:08<1:38:13,  5.06it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11427/41242 [37:08<1:39:10,  5.01it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11427/41242 [37:09<1:39:10,  5.01it/s, training_loss=0.894]
Epoch 1:  28%|██▊       | 11428/41242 [37:09<1:40:59,  4.92it/s, training_loss=0.894]
Epoch 1:  28%|██▊       | 11428/41242 [37:09<1:40:59,  4.92it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11429/41242 [37:09<1:39:52,  4.97it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11429/41242 [37:09<1:39:52,  4.97it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11430/41242 [37:09<1:38:21,  5.05it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11430/41242 [37:09<1:38:21,  5.05it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11431/41242 [37:09<1:37:06,  5.12it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11431/41242 [37:09<1:37:06,  5.12it/s, training_loss=0.042]
Epoch 1:  28%|██▊       | 11432/41242 [37:09<1:36:23,  5.15it/s, training_loss=0.042]
Epoch 1:  28%|██▊       | 11432/41242 [37:09<1:36:23,  5.15it/s, training_loss=0.089]
Epoch 1:  28%|██▊       | 11433/41242 [37:10<1:38:11,  5.06it/s, training_loss=0.089]
Epoch 1:  28%|██▊       | 11433/41242 [37:10<1:38:11,  5.06it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11434/41242 [37:10<1:36:37,  5.14it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11434/41242 [37:10<1:36:37,  5.14it/s, training_loss=0.385]
Epoch 1:  28%|██▊       | 11435/41242 [37:10<1:35:38,  5.19it/s, training_loss=0.385]
Epoch 1:  28%|██▊       | 11435/41242 [37:10<1:35:38,  5.19it/s, training_loss=0.030]
Epoch 1:  28%|██▊       | 11436/41242 [37:10<1:37:25,  5.10it/s, training_loss=0.030]
Epoch 1:  28%|██▊       | 11436/41242 [37:10<1:37:25,  5.10it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11437/41242 [37:10<1:35:44,  5.19it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11437/41242 [37:10<1:35:44,  5.19it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11438/41242 [37:10<1:34:49,  5.24it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11438/41242 [37:11<1:34:49,  5.24it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11439/41242 [37:11<1:35:30,  5.20it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11439/41242 [37:11<1:35:30,  5.20it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11440/41242 [37:11<1:36:14,  5.16it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11440/41242 [37:11<1:36:14,  5.16it/s, training_loss=0.457]
Epoch 1:  28%|██▊       | 11441/41242 [37:11<1:38:07,  5.06it/s, training_loss=0.457]
Epoch 1:  28%|██▊       | 11441/41242 [37:11<1:38:07,  5.06it/s, training_loss=0.127]
Epoch 1:  28%|██▊       | 11442/41242 [37:11<1:38:47,  5.03it/s, training_loss=0.127]
Epoch 1:  28%|██▊       | 11442/41242 [37:11<1:38:47,  5.03it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11443/41242 [37:11<1:37:30,  5.09it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11443/41242 [37:12<1:37:30,  5.09it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11444/41242 [37:12<1:38:26,  5.04it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11444/41242 [37:12<1:38:26,  5.04it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11445/41242 [37:12<1:39:02,  5.01it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11445/41242 [37:12<1:39:02,  5.01it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11446/41242 [37:12<1:38:45,  5.03it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11446/41242 [37:12<1:38:45,  5.03it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11447/41242 [37:12<1:37:47,  5.08it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11447/41242 [37:12<1:37:47,  5.08it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11448/41242 [37:12<1:38:44,  5.03it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11448/41242 [37:13<1:38:44,  5.03it/s, training_loss=0.248]
Epoch 1:  28%|██▊       | 11449/41242 [37:13<1:37:51,  5.07it/s, training_loss=0.248]
Epoch 1:  28%|██▊       | 11449/41242 [37:13<1:37:51,  5.07it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11450/41242 [37:13<1:37:06,  5.11it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11450/41242 [37:13<1:37:06,  5.11it/s, training_loss=0.222]
Epoch 1:  28%|██▊       | 11451/41242 [37:13<1:37:07,  5.11it/s, training_loss=0.222]
Epoch 1:  28%|██▊       | 11451/41242 [37:13<1:37:07,  5.11it/s, training_loss=0.411]
Epoch 1:  28%|██▊       | 11452/41242 [37:13<1:38:17,  5.05it/s, training_loss=0.411]
Epoch 1:  28%|██▊       | 11452/41242 [37:13<1:38:17,  5.05it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11453/41242 [37:13<1:37:48,  5.08it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11453/41242 [37:14<1:37:48,  5.08it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11454/41242 [37:14<1:36:54,  5.12it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11454/41242 [37:14<1:36:54,  5.12it/s, training_loss=0.027]
Epoch 1:  28%|██▊       | 11455/41242 [37:14<1:36:25,  5.15it/s, training_loss=0.027]
Epoch 1:  28%|██▊       | 11455/41242 [37:14<1:36:25,  5.15it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11456/41242 [37:14<1:37:47,  5.08it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11456/41242 [37:14<1:37:47,  5.08it/s, training_loss=0.596]
Epoch 1:  28%|██▊       | 11457/41242 [37:14<1:37:54,  5.07it/s, training_loss=0.596]
Epoch 1:  28%|██▊       | 11457/41242 [37:14<1:37:54,  5.07it/s, training_loss=0.079]
Epoch 1:  28%|██▊       | 11458/41242 [37:14<1:37:56,  5.07it/s, training_loss=0.079]
Epoch 1:  28%|██▊       | 11458/41242 [37:15<1:37:56,  5.07it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11459/41242 [37:15<1:37:38,  5.08it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11459/41242 [37:15<1:37:38,  5.08it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11460/41242 [37:15<1:36:06,  5.16it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11460/41242 [37:15<1:36:06,  5.16it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11461/41242 [37:15<1:37:30,  5.09it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11461/41242 [37:15<1:37:30,  5.09it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11462/41242 [37:15<1:37:07,  5.11it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11462/41242 [37:15<1:37:07,  5.11it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11463/41242 [37:15<1:36:45,  5.13it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11463/41242 [37:16<1:36:45,  5.13it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11464/41242 [37:16<1:35:48,  5.18it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11464/41242 [37:16<1:35:48,  5.18it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11465/41242 [37:16<1:36:37,  5.14it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11465/41242 [37:16<1:36:37,  5.14it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11466/41242 [37:16<1:37:03,  5.11it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11466/41242 [37:16<1:37:03,  5.11it/s, training_loss=0.697]
Epoch 1:  28%|██▊       | 11467/41242 [37:16<1:37:53,  5.07it/s, training_loss=0.697]
Epoch 1:  28%|██▊       | 11467/41242 [37:16<1:37:53,  5.07it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11468/41242 [37:16<1:37:41,  5.08it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11468/41242 [37:17<1:37:41,  5.08it/s, training_loss=0.063]
Epoch 1:  28%|██▊       | 11469/41242 [37:17<1:37:07,  5.11it/s, training_loss=0.063]
Epoch 1:  28%|██▊       | 11469/41242 [37:17<1:37:07,  5.11it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11470/41242 [37:17<1:35:26,  5.20it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11470/41242 [37:17<1:35:26,  5.20it/s, training_loss=0.029]
Epoch 1:  28%|██▊       | 11471/41242 [37:17<1:35:19,  5.20it/s, training_loss=0.029]
Epoch 1:  28%|██▊       | 11471/41242 [37:17<1:35:19,  5.20it/s, training_loss=0.023]
Epoch 1:  28%|██▊       | 11472/41242 [37:17<1:35:14,  5.21it/s, training_loss=0.023]
Epoch 1:  28%|██▊       | 11472/41242 [37:17<1:35:14,  5.21it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11473/41242 [37:17<1:33:55,  5.28it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11473/41242 [37:18<1:33:55,  5.28it/s, training_loss=0.508]
Epoch 1:  28%|██▊       | 11474/41242 [37:18<1:34:15,  5.26it/s, training_loss=0.508]
Epoch 1:  28%|██▊       | 11474/41242 [37:18<1:34:15,  5.26it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11475/41242 [37:18<1:36:03,  5.16it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11475/41242 [37:18<1:36:03,  5.16it/s, training_loss=0.122]
Epoch 1:  28%|██▊       | 11476/41242 [37:18<1:36:23,  5.15it/s, training_loss=0.122]
Epoch 1:  28%|██▊       | 11476/41242 [37:18<1:36:23,  5.15it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11477/41242 [37:18<1:35:45,  5.18it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11477/41242 [37:18<1:35:45,  5.18it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11478/41242 [37:18<1:35:38,  5.19it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11478/41242 [37:18<1:35:38,  5.19it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11479/41242 [37:18<1:34:44,  5.24it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11479/41242 [37:19<1:34:44,  5.24it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11480/41242 [37:19<1:34:38,  5.24it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11480/41242 [37:19<1:34:38,  5.24it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11481/41242 [37:19<1:34:17,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11481/41242 [37:19<1:34:17,  5.26it/s, training_loss=0.233]
Epoch 1:  28%|██▊       | 11482/41242 [37:19<1:34:22,  5.26it/s, training_loss=0.233]
Epoch 1:  28%|██▊       | 11482/41242 [37:19<1:34:22,  5.26it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11483/41242 [37:19<1:34:48,  5.23it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11483/41242 [37:19<1:34:48,  5.23it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11484/41242 [37:19<1:34:52,  5.23it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11484/41242 [37:20<1:34:52,  5.23it/s, training_loss=0.125]
Epoch 1:  28%|██▊       | 11485/41242 [37:20<1:35:11,  5.21it/s, training_loss=0.125]
Epoch 1:  28%|██▊       | 11485/41242 [37:20<1:35:11,  5.21it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11486/41242 [37:20<1:34:22,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11486/41242 [37:20<1:34:22,  5.25it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11487/41242 [37:20<1:33:38,  5.30it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11487/41242 [37:20<1:33:38,  5.30it/s, training_loss=0.180]
Epoch 1:  28%|██▊       | 11488/41242 [37:20<1:36:23,  5.14it/s, training_loss=0.180]
Epoch 1:  28%|██▊       | 11488/41242 [37:20<1:36:23,  5.14it/s, training_loss=0.022]
Epoch 1:  28%|██▊       | 11489/41242 [37:20<1:36:13,  5.15it/s, training_loss=0.022]
Epoch 1:  28%|██▊       | 11489/41242 [37:21<1:36:13,  5.15it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11490/41242 [37:21<1:36:29,  5.14it/s, training_loss=0.025]
Epoch 1:  28%|██▊       | 11490/41242 [37:21<1:36:29,  5.14it/s, training_loss=0.269]
Epoch 1:  28%|██▊       | 11491/41242 [37:21<1:36:41,  5.13it/s, training_loss=0.269]
Epoch 1:  28%|██▊       | 11491/41242 [37:21<1:36:41,  5.13it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11492/41242 [37:21<1:35:47,  5.18it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11492/41242 [37:21<1:35:47,  5.18it/s, training_loss=0.014]
Epoch 1:  28%|██▊       | 11493/41242 [37:21<1:37:35,  5.08it/s, training_loss=0.014]
Epoch 1:  28%|██▊       | 11493/41242 [37:21<1:37:35,  5.08it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11494/41242 [37:21<1:39:11,  5.00it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11494/41242 [37:22<1:39:11,  5.00it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11495/41242 [37:22<1:37:21,  5.09it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11495/41242 [37:22<1:37:21,  5.09it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11496/41242 [37:22<1:36:18,  5.15it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11496/41242 [37:22<1:36:18,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11497/41242 [37:22<1:37:09,  5.10it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11497/41242 [37:22<1:37:09,  5.10it/s, training_loss=0.242]
Epoch 1:  28%|██▊       | 11498/41242 [37:22<1:37:33,  5.08it/s, training_loss=0.242]
Epoch 1:  28%|██▊       | 11498/41242 [37:22<1:37:33,  5.08it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11499/41242 [37:22<1:38:39,  5.02it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11499/41242 [37:23<1:38:39,  5.02it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11500/41242 [37:23<1:40:15,  4.94it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11500/41242 [37:23<1:40:15,  4.94it/s, training_loss=0.084]
Epoch 1:  28%|██▊       | 11501/41242 [37:23<1:39:19,  4.99it/s, training_loss=0.084]
Epoch 1:  28%|██▊       | 11501/41242 [37:23<1:39:19,  4.99it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11502/41242 [37:23<1:38:51,  5.01it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11502/41242 [37:23<1:38:51,  5.01it/s, training_loss=0.255]
Epoch 1:  28%|██▊       | 11503/41242 [37:23<1:38:06,  5.05it/s, training_loss=0.255]
Epoch 1:  28%|██▊       | 11503/41242 [37:23<1:38:06,  5.05it/s, training_loss=0.469]
Epoch 1:  28%|██▊       | 11504/41242 [37:23<1:37:13,  5.10it/s, training_loss=0.469]
Epoch 1:  28%|██▊       | 11504/41242 [37:24<1:37:13,  5.10it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11505/41242 [37:24<1:36:31,  5.13it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11505/41242 [37:24<1:36:31,  5.13it/s, training_loss=0.468]
Epoch 1:  28%|██▊       | 11506/41242 [37:24<1:36:01,  5.16it/s, training_loss=0.468]
Epoch 1:  28%|██▊       | 11506/41242 [37:24<1:36:01,  5.16it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11507/41242 [37:24<1:35:20,  5.20it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11507/41242 [37:24<1:35:20,  5.20it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11508/41242 [37:24<1:34:20,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11508/41242 [37:24<1:34:20,  5.25it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11509/41242 [37:24<1:34:28,  5.25it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11509/41242 [37:24<1:34:28,  5.25it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11510/41242 [37:24<1:34:39,  5.23it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11510/41242 [37:25<1:34:39,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11511/41242 [37:25<1:34:46,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11511/41242 [37:25<1:34:46,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11512/41242 [37:25<1:33:51,  5.28it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11512/41242 [37:25<1:33:51,  5.28it/s, training_loss=0.099]
Epoch 1:  28%|██▊       | 11513/41242 [37:25<1:34:20,  5.25it/s, training_loss=0.099]
Epoch 1:  28%|██▊       | 11513/41242 [37:25<1:34:20,  5.25it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11514/41242 [37:25<1:33:31,  5.30it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11514/41242 [37:25<1:33:31,  5.30it/s, training_loss=0.222]
Epoch 1:  28%|██▊       | 11515/41242 [37:25<1:33:23,  5.31it/s, training_loss=0.222]
Epoch 1:  28%|██▊       | 11515/41242 [37:26<1:33:23,  5.31it/s, training_loss=0.372]
Epoch 1:  28%|██▊       | 11516/41242 [37:26<1:34:18,  5.25it/s, training_loss=0.372]
Epoch 1:  28%|██▊       | 11516/41242 [37:26<1:34:18,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11517/41242 [37:26<1:33:26,  5.30it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11517/41242 [37:26<1:33:26,  5.30it/s, training_loss=0.609]
Epoch 1:  28%|██▊       | 11518/41242 [37:26<1:33:23,  5.30it/s, training_loss=0.609]
Epoch 1:  28%|██▊       | 11518/41242 [37:26<1:33:23,  5.30it/s, training_loss=0.064]
Epoch 1:  28%|██▊       | 11519/41242 [37:26<1:33:41,  5.29it/s, training_loss=0.064]
Epoch 1:  28%|██▊       | 11519/41242 [37:26<1:33:41,  5.29it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11520/41242 [37:26<1:34:14,  5.26it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11520/41242 [37:27<1:34:14,  5.26it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11521/41242 [37:27<1:34:46,  5.23it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11521/41242 [37:27<1:34:46,  5.23it/s, training_loss=0.731]
Epoch 1:  28%|██▊       | 11522/41242 [37:27<1:34:41,  5.23it/s, training_loss=0.731]
Epoch 1:  28%|██▊       | 11522/41242 [37:27<1:34:41,  5.23it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11523/41242 [37:27<1:34:21,  5.25it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11523/41242 [37:27<1:34:21,  5.25it/s, training_loss=0.085]
Epoch 1:  28%|██▊       | 11524/41242 [37:27<1:34:22,  5.25it/s, training_loss=0.085]
Epoch 1:  28%|██▊       | 11524/41242 [37:27<1:34:22,  5.25it/s, training_loss=0.070]
Epoch 1:  28%|██▊       | 11525/41242 [37:27<1:36:02,  5.16it/s, training_loss=0.070]
Epoch 1:  28%|██▊       | 11525/41242 [37:28<1:36:02,  5.16it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11526/41242 [37:28<1:40:00,  4.95it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11526/41242 [37:28<1:40:00,  4.95it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11527/41242 [37:28<1:40:09,  4.94it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11527/41242 [37:28<1:40:09,  4.94it/s, training_loss=0.052]
Epoch 1:  28%|██▊       | 11528/41242 [37:28<1:38:12,  5.04it/s, training_loss=0.052]
Epoch 1:  28%|██▊       | 11528/41242 [37:28<1:38:12,  5.04it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11529/41242 [37:28<1:36:22,  5.14it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11529/41242 [37:28<1:36:22,  5.14it/s, training_loss=0.304]
Epoch 1:  28%|██▊       | 11530/41242 [37:28<1:35:38,  5.18it/s, training_loss=0.304]
Epoch 1:  28%|██▊       | 11530/41242 [37:29<1:35:38,  5.18it/s, training_loss=0.034]
Epoch 1:  28%|██▊       | 11531/41242 [37:29<1:35:26,  5.19it/s, training_loss=0.034]
Epoch 1:  28%|██▊       | 11531/41242 [37:29<1:35:26,  5.19it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11532/41242 [37:29<1:34:21,  5.25it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11532/41242 [37:29<1:34:21,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11533/41242 [37:29<1:36:15,  5.14it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11533/41242 [37:29<1:36:15,  5.14it/s, training_loss=0.480]
Epoch 1:  28%|██▊       | 11534/41242 [37:29<1:36:51,  5.11it/s, training_loss=0.480]
Epoch 1:  28%|██▊       | 11534/41242 [37:29<1:36:51,  5.11it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11535/41242 [37:29<1:35:56,  5.16it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11535/41242 [37:30<1:35:56,  5.16it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11536/41242 [37:30<1:36:00,  5.16it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11536/41242 [37:30<1:36:00,  5.16it/s, training_loss=0.975]
Epoch 1:  28%|██▊       | 11537/41242 [37:30<1:35:21,  5.19it/s, training_loss=0.975]
Epoch 1:  28%|██▊       | 11537/41242 [37:30<1:35:21,  5.19it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11538/41242 [37:30<1:34:44,  5.23it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11538/41242 [37:30<1:34:44,  5.23it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11539/41242 [37:30<1:34:03,  5.26it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11539/41242 [37:30<1:34:03,  5.26it/s, training_loss=0.096]
Epoch 1:  28%|██▊       | 11540/41242 [37:30<1:34:18,  5.25it/s, training_loss=0.096]
Epoch 1:  28%|██▊       | 11540/41242 [37:30<1:34:18,  5.25it/s, training_loss=0.060]
Epoch 1:  28%|██▊       | 11541/41242 [37:30<1:34:50,  5.22it/s, training_loss=0.060]
Epoch 1:  28%|██▊       | 11541/41242 [37:31<1:34:50,  5.22it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11542/41242 [37:31<1:33:56,  5.27it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11542/41242 [37:31<1:33:56,  5.27it/s, training_loss=0.516]
Epoch 1:  28%|██▊       | 11543/41242 [37:31<1:34:01,  5.26it/s, training_loss=0.516]
Epoch 1:  28%|██▊       | 11543/41242 [37:31<1:34:01,  5.26it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11544/41242 [37:31<1:34:45,  5.22it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11544/41242 [37:31<1:34:45,  5.22it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11545/41242 [37:31<1:34:31,  5.24it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11545/41242 [37:31<1:34:31,  5.24it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11546/41242 [37:31<1:35:13,  5.20it/s, training_loss=0.062]
Epoch 1:  28%|██▊       | 11546/41242 [37:32<1:35:13,  5.20it/s, training_loss=0.078]
Epoch 1:  28%|██▊       | 11547/41242 [37:32<1:34:37,  5.23it/s, training_loss=0.078]
Epoch 1:  28%|██▊       | 11547/41242 [37:32<1:34:37,  5.23it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11548/41242 [37:32<1:33:40,  5.28it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11548/41242 [37:32<1:33:40,  5.28it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11549/41242 [37:32<1:33:39,  5.28it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11549/41242 [37:32<1:33:39,  5.28it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11550/41242 [37:32<1:33:10,  5.31it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11550/41242 [37:32<1:33:10,  5.31it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11551/41242 [37:32<1:35:27,  5.18it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11551/41242 [37:33<1:35:27,  5.18it/s, training_loss=0.047]
Epoch 1:  28%|██▊       | 11552/41242 [37:33<1:36:09,  5.15it/s, training_loss=0.047]
Epoch 1:  28%|██▊       | 11552/41242 [37:33<1:36:09,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11553/41242 [37:33<1:35:51,  5.16it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11553/41242 [37:33<1:35:51,  5.16it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11554/41242 [37:33<1:36:24,  5.13it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11554/41242 [37:33<1:36:24,  5.13it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11555/41242 [37:33<1:35:47,  5.17it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11555/41242 [37:33<1:35:47,  5.17it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11556/41242 [37:33<1:35:33,  5.18it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11556/41242 [37:34<1:35:33,  5.18it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11557/41242 [37:34<1:34:26,  5.24it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11557/41242 [37:34<1:34:26,  5.24it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11558/41242 [37:34<1:33:21,  5.30it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11558/41242 [37:34<1:33:21,  5.30it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11559/41242 [37:34<1:34:29,  5.24it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11559/41242 [37:34<1:34:29,  5.24it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11560/41242 [37:34<1:33:52,  5.27it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11560/41242 [37:34<1:33:52,  5.27it/s, training_loss=0.023]
Epoch 1:  28%|██▊       | 11561/41242 [37:34<1:33:49,  5.27it/s, training_loss=0.023]
Epoch 1:  28%|██▊       | 11561/41242 [37:34<1:33:49,  5.27it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11562/41242 [37:34<1:33:54,  5.27it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11562/41242 [37:35<1:33:54,  5.27it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11563/41242 [37:35<1:33:39,  5.28it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11563/41242 [37:35<1:33:39,  5.28it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11564/41242 [37:35<1:33:16,  5.30it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11564/41242 [37:35<1:33:16,  5.30it/s, training_loss=0.031]
Epoch 1:  28%|██▊       | 11565/41242 [37:35<1:33:56,  5.27it/s, training_loss=0.031]
Epoch 1:  28%|██▊       | 11565/41242 [37:35<1:33:56,  5.27it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11566/41242 [37:35<1:34:01,  5.26it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11566/41242 [37:35<1:34:01,  5.26it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11567/41242 [37:35<1:33:28,  5.29it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11567/41242 [37:36<1:33:28,  5.29it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11568/41242 [37:36<1:33:01,  5.32it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11568/41242 [37:36<1:33:01,  5.32it/s, training_loss=0.979]
Epoch 1:  28%|██▊       | 11569/41242 [37:36<1:33:06,  5.31it/s, training_loss=0.979]
Epoch 1:  28%|██▊       | 11569/41242 [37:36<1:33:06,  5.31it/s, training_loss=0.661]
Epoch 1:  28%|██▊       | 11570/41242 [37:36<1:33:22,  5.30it/s, training_loss=0.661]
Epoch 1:  28%|██▊       | 11570/41242 [37:36<1:33:22,  5.30it/s, training_loss=0.419]
Epoch 1:  28%|██▊       | 11571/41242 [37:36<1:32:57,  5.32it/s, training_loss=0.419]
Epoch 1:  28%|██▊       | 11571/41242 [37:36<1:32:57,  5.32it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11572/41242 [37:36<1:34:54,  5.21it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11572/41242 [37:37<1:34:54,  5.21it/s, training_loss=0.290]
Epoch 1:  28%|██▊       | 11573/41242 [37:37<1:36:38,  5.12it/s, training_loss=0.290]
Epoch 1:  28%|██▊       | 11573/41242 [37:37<1:36:38,  5.12it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11574/41242 [37:37<1:36:00,  5.15it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11574/41242 [37:37<1:36:00,  5.15it/s, training_loss=0.057]
Epoch 1:  28%|██▊       | 11575/41242 [37:37<1:35:35,  5.17it/s, training_loss=0.057]
Epoch 1:  28%|██▊       | 11575/41242 [37:37<1:35:35,  5.17it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11576/41242 [37:37<1:34:49,  5.21it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11576/41242 [37:37<1:34:49,  5.21it/s, training_loss=0.063]
Epoch 1:  28%|██▊       | 11577/41242 [37:37<1:35:46,  5.16it/s, training_loss=0.063]
Epoch 1:  28%|██▊       | 11577/41242 [37:38<1:35:46,  5.16it/s, training_loss=0.074]
Epoch 1:  28%|██▊       | 11578/41242 [37:38<1:35:32,  5.17it/s, training_loss=0.074]
Epoch 1:  28%|██▊       | 11578/41242 [37:38<1:35:32,  5.17it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11579/41242 [37:38<1:35:11,  5.19it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11579/41242 [37:38<1:35:11,  5.19it/s, training_loss=0.182]
Epoch 1:  28%|██▊       | 11580/41242 [37:38<1:34:29,  5.23it/s, training_loss=0.182]
Epoch 1:  28%|██▊       | 11580/41242 [37:38<1:34:29,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11581/41242 [37:38<1:34:10,  5.25it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11581/41242 [37:38<1:34:10,  5.25it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11582/41242 [37:38<1:34:00,  5.26it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11582/41242 [37:38<1:34:00,  5.26it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11583/41242 [37:38<1:33:08,  5.31it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11583/41242 [37:39<1:33:08,  5.31it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11584/41242 [37:39<1:33:02,  5.31it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11584/41242 [37:39<1:33:02,  5.31it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11585/41242 [37:39<1:34:17,  5.24it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11585/41242 [37:39<1:34:17,  5.24it/s, training_loss=0.257]
Epoch 1:  28%|██▊       | 11586/41242 [37:39<1:35:36,  5.17it/s, training_loss=0.257]
Epoch 1:  28%|██▊       | 11586/41242 [37:39<1:35:36,  5.17it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11587/41242 [37:39<1:35:56,  5.15it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11587/41242 [37:39<1:35:56,  5.15it/s, training_loss=0.022]
Epoch 1:  28%|██▊       | 11588/41242 [37:39<1:36:16,  5.13it/s, training_loss=0.022]
Epoch 1:  28%|██▊       | 11588/41242 [37:40<1:36:16,  5.13it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11589/41242 [37:40<1:36:25,  5.13it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11589/41242 [37:40<1:36:25,  5.13it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11590/41242 [37:40<1:35:19,  5.18it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11590/41242 [37:40<1:35:19,  5.18it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11591/41242 [37:40<1:35:03,  5.20it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11591/41242 [37:40<1:35:03,  5.20it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11592/41242 [37:40<1:34:56,  5.21it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11592/41242 [37:40<1:34:56,  5.21it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11593/41242 [37:40<1:34:23,  5.23it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11593/41242 [37:41<1:34:23,  5.23it/s, training_loss=0.077]
Epoch 1:  28%|██▊       | 11594/41242 [37:41<1:34:25,  5.23it/s, training_loss=0.077]
Epoch 1:  28%|██▊       | 11594/41242 [37:41<1:34:25,  5.23it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11595/41242 [37:41<1:36:14,  5.13it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11595/41242 [37:41<1:36:14,  5.13it/s, training_loss=0.048]
Epoch 1:  28%|██▊       | 11596/41242 [37:41<1:35:46,  5.16it/s, training_loss=0.048]
Epoch 1:  28%|██▊       | 11596/41242 [37:41<1:35:46,  5.16it/s, training_loss=0.521]
Epoch 1:  28%|██▊       | 11597/41242 [37:41<1:34:43,  5.22it/s, training_loss=0.521]
Epoch 1:  28%|██▊       | 11597/41242 [37:41<1:34:43,  5.22it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11598/41242 [37:41<1:34:04,  5.25it/s, training_loss=0.015]
Epoch 1:  28%|██▊       | 11598/41242 [37:42<1:34:04,  5.25it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11599/41242 [37:42<1:34:48,  5.21it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11599/41242 [37:42<1:34:48,  5.21it/s, training_loss=0.041]
Epoch 1:  28%|██▊       | 11600/41242 [37:42<1:35:08,  5.19it/s, training_loss=0.041]
Epoch 1:  28%|██▊       | 11600/41242 [37:42<1:35:08,  5.19it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11601/41242 [37:42<1:36:13,  5.13it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11601/41242 [37:42<1:36:13,  5.13it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11602/41242 [37:42<1:35:57,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11602/41242 [37:42<1:35:57,  5.15it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11603/41242 [37:42<1:35:33,  5.17it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11603/41242 [37:43<1:35:33,  5.17it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11604/41242 [37:43<1:35:39,  5.16it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11604/41242 [37:43<1:35:39,  5.16it/s, training_loss=0.032]
Epoch 1:  28%|██▊       | 11605/41242 [37:43<1:36:40,  5.11it/s, training_loss=0.032]
Epoch 1:  28%|██▊       | 11605/41242 [37:43<1:36:40,  5.11it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11606/41242 [37:43<1:36:14,  5.13it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11606/41242 [37:43<1:36:14,  5.13it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11607/41242 [37:43<1:37:34,  5.06it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11607/41242 [37:43<1:37:34,  5.06it/s, training_loss=0.064]
Epoch 1:  28%|██▊       | 11608/41242 [37:43<1:37:44,  5.05it/s, training_loss=0.064]
Epoch 1:  28%|██▊       | 11608/41242 [37:44<1:37:44,  5.05it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11609/41242 [37:44<1:36:49,  5.10it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11609/41242 [37:44<1:36:49,  5.10it/s, training_loss=0.374]
Epoch 1:  28%|██▊       | 11610/41242 [37:44<1:37:40,  5.06it/s, training_loss=0.374]
Epoch 1:  28%|██▊       | 11610/41242 [37:44<1:37:40,  5.06it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11611/41242 [37:44<1:37:53,  5.04it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11611/41242 [37:44<1:37:53,  5.04it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11612/41242 [37:44<1:36:41,  5.11it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11612/41242 [37:44<1:36:41,  5.11it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11613/41242 [37:44<1:35:18,  5.18it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11613/41242 [37:44<1:35:18,  5.18it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11614/41242 [37:44<1:35:17,  5.18it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11614/41242 [37:45<1:35:17,  5.18it/s, training_loss=0.046]
Epoch 1:  28%|██▊       | 11615/41242 [37:45<1:34:55,  5.20it/s, training_loss=0.046]
Epoch 1:  28%|██▊       | 11615/41242 [37:45<1:34:55,  5.20it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11616/41242 [37:45<1:34:40,  5.22it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11616/41242 [37:45<1:34:40,  5.22it/s, training_loss=0.045]
Epoch 1:  28%|██▊       | 11617/41242 [37:45<1:34:59,  5.20it/s, training_loss=0.045]
Epoch 1:  28%|██▊       | 11617/41242 [37:45<1:34:59,  5.20it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11618/41242 [37:45<1:34:40,  5.21it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11618/41242 [37:45<1:34:40,  5.21it/s, training_loss=0.244]
Epoch 1:  28%|██▊       | 11619/41242 [37:45<1:36:46,  5.10it/s, training_loss=0.244]
Epoch 1:  28%|██▊       | 11619/41242 [37:46<1:36:46,  5.10it/s, training_loss=0.261]
Epoch 1:  28%|██▊       | 11620/41242 [37:46<1:38:15,  5.02it/s, training_loss=0.261]
Epoch 1:  28%|██▊       | 11620/41242 [37:46<1:38:15,  5.02it/s, training_loss=0.039]
Epoch 1:  28%|██▊       | 11621/41242 [37:46<1:36:51,  5.10it/s, training_loss=0.039]
Epoch 1:  28%|██▊       | 11621/41242 [37:46<1:36:51,  5.10it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11622/41242 [37:46<1:37:26,  5.07it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11622/41242 [37:46<1:37:26,  5.07it/s, training_loss=0.435]
Epoch 1:  28%|██▊       | 11623/41242 [37:46<1:38:12,  5.03it/s, training_loss=0.435]
Epoch 1:  28%|██▊       | 11623/41242 [37:46<1:38:12,  5.03it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11624/41242 [37:46<1:37:33,  5.06it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11624/41242 [37:47<1:37:33,  5.06it/s, training_loss=0.200]
Epoch 1:  28%|██▊       | 11625/41242 [37:47<1:37:43,  5.05it/s, training_loss=0.200]
Epoch 1:  28%|██▊       | 11625/41242 [37:47<1:37:43,  5.05it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11626/41242 [37:47<1:37:23,  5.07it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11626/41242 [37:47<1:37:23,  5.07it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11627/41242 [37:47<1:37:49,  5.05it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11627/41242 [37:47<1:37:49,  5.05it/s, training_loss=0.024]
Epoch 1:  28%|██▊       | 11628/41242 [37:47<1:37:16,  5.07it/s, training_loss=0.024]
Epoch 1:  28%|██▊       | 11628/41242 [37:47<1:37:16,  5.07it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11629/41242 [37:47<1:35:42,  5.16it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11629/41242 [37:48<1:35:42,  5.16it/s, training_loss=0.718]
Epoch 1:  28%|██▊       | 11630/41242 [37:48<1:35:09,  5.19it/s, training_loss=0.718]
Epoch 1:  28%|██▊       | 11630/41242 [37:48<1:35:09,  5.19it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11631/41242 [37:48<1:34:10,  5.24it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11631/41242 [37:48<1:34:10,  5.24it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11632/41242 [37:48<1:33:53,  5.26it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11632/41242 [37:48<1:33:53,  5.26it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11633/41242 [37:48<1:34:01,  5.25it/s, training_loss=0.013]
Epoch 1:  28%|██▊       | 11633/41242 [37:48<1:34:01,  5.25it/s, training_loss=0.104]
Epoch 1:  28%|██▊       | 11634/41242 [37:48<1:34:14,  5.24it/s, training_loss=0.104]
Epoch 1:  28%|██▊       | 11634/41242 [37:49<1:34:14,  5.24it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11635/41242 [37:49<1:34:37,  5.21it/s, training_loss=0.010]
Epoch 1:  28%|██▊       | 11635/41242 [37:49<1:34:37,  5.21it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11636/41242 [37:49<1:35:42,  5.16it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11636/41242 [37:49<1:35:42,  5.16it/s, training_loss=0.942]
Epoch 1:  28%|██▊       | 11637/41242 [37:49<1:35:57,  5.14it/s, training_loss=0.942]
Epoch 1:  28%|██▊       | 11637/41242 [37:49<1:35:57,  5.14it/s, training_loss=0.136]
Epoch 1:  28%|██▊       | 11638/41242 [37:49<1:35:51,  5.15it/s, training_loss=0.136]
Epoch 1:  28%|██▊       | 11638/41242 [37:49<1:35:51,  5.15it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11639/41242 [37:49<1:36:52,  5.09it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11639/41242 [37:50<1:36:52,  5.09it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11640/41242 [37:50<1:36:16,  5.12it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11640/41242 [37:50<1:36:16,  5.12it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11641/41242 [37:50<1:37:05,  5.08it/s, training_loss=0.021]
Epoch 1:  28%|██▊       | 11641/41242 [37:50<1:37:05,  5.08it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11642/41242 [37:50<1:36:28,  5.11it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11642/41242 [37:50<1:36:28,  5.11it/s, training_loss=0.343]
Epoch 1:  28%|██▊       | 11643/41242 [37:50<1:37:43,  5.05it/s, training_loss=0.343]
Epoch 1:  28%|██▊       | 11643/41242 [37:50<1:37:43,  5.05it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11644/41242 [37:50<1:37:55,  5.04it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11644/41242 [37:51<1:37:55,  5.04it/s, training_loss=0.259]
Epoch 1:  28%|██▊       | 11645/41242 [37:51<1:37:06,  5.08it/s, training_loss=0.259]
Epoch 1:  28%|██▊       | 11645/41242 [37:51<1:37:06,  5.08it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11646/41242 [37:51<1:38:33,  5.00it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11646/41242 [37:51<1:38:33,  5.00it/s, training_loss=0.339]
Epoch 1:  28%|██▊       | 11647/41242 [37:51<1:39:51,  4.94it/s, training_loss=0.339]
Epoch 1:  28%|██▊       | 11647/41242 [37:51<1:39:51,  4.94it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11648/41242 [37:51<1:39:29,  4.96it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11648/41242 [37:51<1:39:29,  4.96it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11649/41242 [37:51<1:38:57,  4.98it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11649/41242 [37:52<1:38:57,  4.98it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11650/41242 [37:52<1:37:50,  5.04it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11650/41242 [37:52<1:37:50,  5.04it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11651/41242 [37:52<1:36:51,  5.09it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11651/41242 [37:52<1:36:51,  5.09it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11652/41242 [37:52<1:35:24,  5.17it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11652/41242 [37:52<1:35:24,  5.17it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11653/41242 [37:52<1:34:30,  5.22it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11653/41242 [37:52<1:34:30,  5.22it/s, training_loss=0.019]
Epoch 1:  28%|██▊       | 11654/41242 [37:52<1:33:50,  5.25it/s, training_loss=0.019]
Epoch 1:  28%|██▊       | 11654/41242 [37:52<1:33:50,  5.25it/s, training_loss=0.316]
Epoch 1:  28%|██▊       | 11655/41242 [37:52<1:33:50,  5.25it/s, training_loss=0.316]
Epoch 1:  28%|██▊       | 11655/41242 [37:53<1:33:50,  5.25it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11656/41242 [37:53<1:33:07,  5.30it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11656/41242 [37:53<1:33:07,  5.30it/s, training_loss=0.032]
Epoch 1:  28%|██▊       | 11657/41242 [37:53<1:33:49,  5.26it/s, training_loss=0.032]
Epoch 1:  28%|██▊       | 11657/41242 [37:53<1:33:49,  5.26it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11658/41242 [37:53<1:33:04,  5.30it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11658/41242 [37:53<1:33:04,  5.30it/s, training_loss=0.019]
Epoch 1:  28%|██▊       | 11659/41242 [37:53<1:32:52,  5.31it/s, training_loss=0.019]
Epoch 1:  28%|██▊       | 11659/41242 [37:53<1:32:52,  5.31it/s, training_loss=0.649]
Epoch 1:  28%|██▊       | 11660/41242 [37:53<1:33:26,  5.28it/s, training_loss=0.649]
Epoch 1:  28%|██▊       | 11660/41242 [37:54<1:33:26,  5.28it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11661/41242 [37:54<1:33:47,  5.26it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11661/41242 [37:54<1:33:47,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11662/41242 [37:54<1:33:35,  5.27it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11662/41242 [37:54<1:33:35,  5.27it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11663/41242 [37:54<1:33:00,  5.30it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11663/41242 [37:54<1:33:00,  5.30it/s, training_loss=0.163]
Epoch 1:  28%|██▊       | 11664/41242 [37:54<1:32:49,  5.31it/s, training_loss=0.163]
Epoch 1:  28%|██▊       | 11664/41242 [37:54<1:32:49,  5.31it/s, training_loss=0.094]
Epoch 1:  28%|██▊       | 11665/41242 [37:54<1:33:06,  5.29it/s, training_loss=0.094]
Epoch 1:  28%|██▊       | 11665/41242 [37:55<1:33:06,  5.29it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11666/41242 [37:55<1:32:56,  5.30it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11666/41242 [37:55<1:32:56,  5.30it/s, training_loss=0.447]
Epoch 1:  28%|██▊       | 11667/41242 [37:55<1:33:25,  5.28it/s, training_loss=0.447]
Epoch 1:  28%|██▊       | 11667/41242 [37:55<1:33:25,  5.28it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11668/41242 [37:55<1:34:51,  5.20it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11668/41242 [37:55<1:34:51,  5.20it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11669/41242 [37:55<1:34:31,  5.21it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11669/41242 [37:55<1:34:31,  5.21it/s, training_loss=1.224]
Epoch 1:  28%|██▊       | 11670/41242 [37:55<1:35:47,  5.15it/s, training_loss=1.224]
Epoch 1:  28%|██▊       | 11670/41242 [37:56<1:35:47,  5.15it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11671/41242 [37:56<1:37:28,  5.06it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11671/41242 [37:56<1:37:28,  5.06it/s, training_loss=0.410]
Epoch 1:  28%|██▊       | 11672/41242 [37:56<1:37:56,  5.03it/s, training_loss=0.410]
Epoch 1:  28%|██▊       | 11672/41242 [37:56<1:37:56,  5.03it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11673/41242 [37:56<1:38:37,  5.00it/s, training_loss=0.040]
Epoch 1:  28%|██▊       | 11673/41242 [37:56<1:38:37,  5.00it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11674/41242 [37:56<1:37:22,  5.06it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11674/41242 [37:56<1:37:22,  5.06it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11675/41242 [37:56<1:36:02,  5.13it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11675/41242 [37:57<1:36:02,  5.13it/s, training_loss=0.037]
Epoch 1:  28%|██▊       | 11676/41242 [37:57<1:36:05,  5.13it/s, training_loss=0.037]
Epoch 1:  28%|██▊       | 11676/41242 [37:57<1:36:05,  5.13it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11677/41242 [37:57<1:34:39,  5.21it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11677/41242 [37:57<1:34:39,  5.21it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11678/41242 [37:57<1:33:47,  5.25it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11678/41242 [37:57<1:33:47,  5.25it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11679/41242 [37:57<1:33:37,  5.26it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11679/41242 [37:57<1:33:37,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11680/41242 [37:57<1:33:31,  5.27it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11680/41242 [37:57<1:33:31,  5.27it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11681/41242 [37:57<1:33:28,  5.27it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11681/41242 [37:58<1:33:28,  5.27it/s, training_loss=0.602]
Epoch 1:  28%|██▊       | 11682/41242 [37:58<1:35:47,  5.14it/s, training_loss=0.602]
Epoch 1:  28%|██▊       | 11682/41242 [37:58<1:35:47,  5.14it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11683/41242 [37:58<1:35:41,  5.15it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11683/41242 [37:58<1:35:41,  5.15it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11684/41242 [37:58<1:35:23,  5.16it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11684/41242 [37:58<1:35:23,  5.16it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11685/41242 [37:58<1:34:53,  5.19it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11685/41242 [37:58<1:34:53,  5.19it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11686/41242 [37:58<1:34:31,  5.21it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11686/41242 [37:59<1:34:31,  5.21it/s, training_loss=0.080]
Epoch 1:  28%|██▊       | 11687/41242 [37:59<1:34:14,  5.23it/s, training_loss=0.080]
Epoch 1:  28%|██▊       | 11687/41242 [37:59<1:34:14,  5.23it/s, training_loss=0.555]
Epoch 1:  28%|██▊       | 11688/41242 [37:59<1:34:15,  5.23it/s, training_loss=0.555]
Epoch 1:  28%|██▊       | 11688/41242 [37:59<1:34:15,  5.23it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11689/41242 [37:59<1:33:17,  5.28it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11689/41242 [37:59<1:33:17,  5.28it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11690/41242 [37:59<1:34:31,  5.21it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11690/41242 [37:59<1:34:31,  5.21it/s, training_loss=0.029]
Epoch 1:  28%|██▊       | 11691/41242 [37:59<1:34:17,  5.22it/s, training_loss=0.029]
Epoch 1:  28%|██▊       | 11691/41242 [38:00<1:34:17,  5.22it/s, training_loss=0.154]
Epoch 1:  28%|██▊       | 11692/41242 [38:00<1:36:22,  5.11it/s, training_loss=0.154]
Epoch 1:  28%|██▊       | 11692/41242 [38:00<1:36:22,  5.11it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11693/41242 [38:00<1:36:37,  5.10it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11693/41242 [38:00<1:36:37,  5.10it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11694/41242 [38:00<1:35:32,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11694/41242 [38:00<1:35:32,  5.15it/s, training_loss=0.048]
Epoch 1:  28%|██▊       | 11695/41242 [38:00<1:35:36,  5.15it/s, training_loss=0.048]
Epoch 1:  28%|██▊       | 11695/41242 [38:00<1:35:36,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11696/41242 [38:00<1:34:22,  5.22it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11696/41242 [38:01<1:34:22,  5.22it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11697/41242 [38:01<1:32:57,  5.30it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11697/41242 [38:01<1:32:57,  5.30it/s, training_loss=0.047]
Epoch 1:  28%|██▊       | 11698/41242 [38:01<1:32:56,  5.30it/s, training_loss=0.047]
Epoch 1:  28%|██▊       | 11698/41242 [38:01<1:32:56,  5.30it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11699/41242 [38:01<1:32:51,  5.30it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11699/41242 [38:01<1:32:51,  5.30it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11700/41242 [38:01<1:33:24,  5.27it/s, training_loss=0.028]
Epoch 1:  28%|██▊       | 11700/41242 [38:01<1:33:24,  5.27it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11701/41242 [38:01<1:33:35,  5.26it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11701/41242 [38:02<1:33:35,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11702/41242 [38:02<1:33:40,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11702/41242 [38:02<1:33:40,  5.26it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11703/41242 [38:02<1:34:32,  5.21it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11703/41242 [38:02<1:34:32,  5.21it/s, training_loss=0.037]
Epoch 1:  28%|██▊       | 11704/41242 [38:02<1:35:19,  5.16it/s, training_loss=0.037]
Epoch 1:  28%|██▊       | 11704/41242 [38:02<1:35:19,  5.16it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11705/41242 [38:02<1:36:13,  5.12it/s, training_loss=0.011]
Epoch 1:  28%|██▊       | 11705/41242 [38:02<1:36:13,  5.12it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11706/41242 [38:02<1:37:01,  5.07it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11706/41242 [38:02<1:37:01,  5.07it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11707/41242 [38:02<1:35:49,  5.14it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11707/41242 [38:03<1:35:49,  5.14it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11708/41242 [38:03<1:35:13,  5.17it/s, training_loss=0.073]
Epoch 1:  28%|██▊       | 11708/41242 [38:03<1:35:13,  5.17it/s, training_loss=0.034]
Epoch 1:  28%|██▊       | 11709/41242 [38:03<1:34:25,  5.21it/s, training_loss=0.034]
Epoch 1:  28%|██▊       | 11709/41242 [38:03<1:34:25,  5.21it/s, training_loss=0.027]
Epoch 1:  28%|██▊       | 11710/41242 [38:03<1:34:31,  5.21it/s, training_loss=0.027]
Epoch 1:  28%|██▊       | 11710/41242 [38:03<1:34:31,  5.21it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11711/41242 [38:03<1:34:16,  5.22it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11711/41242 [38:03<1:34:16,  5.22it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11712/41242 [38:03<1:35:28,  5.15it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11712/41242 [38:04<1:35:28,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11713/41242 [38:04<1:35:34,  5.15it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11713/41242 [38:04<1:35:34,  5.15it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11714/41242 [38:04<1:37:12,  5.06it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11714/41242 [38:04<1:37:12,  5.06it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11715/41242 [38:04<1:36:41,  5.09it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11715/41242 [38:04<1:36:41,  5.09it/s, training_loss=0.163]
Epoch 1:  28%|██▊       | 11716/41242 [38:04<1:37:35,  5.04it/s, training_loss=0.163]
Epoch 1:  28%|██▊       | 11716/41242 [38:04<1:37:35,  5.04it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11717/41242 [38:04<1:38:18,  5.01it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11717/41242 [38:05<1:38:18,  5.01it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11718/41242 [38:05<1:38:07,  5.01it/s, training_loss=0.016]
Epoch 1:  28%|██▊       | 11718/41242 [38:05<1:38:07,  5.01it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11719/41242 [38:05<1:37:27,  5.05it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11719/41242 [38:05<1:37:27,  5.05it/s, training_loss=0.190]
Epoch 1:  28%|██▊       | 11720/41242 [38:05<1:36:27,  5.10it/s, training_loss=0.190]
Epoch 1:  28%|██▊       | 11720/41242 [38:05<1:36:27,  5.10it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11721/41242 [38:05<1:36:11,  5.11it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11721/41242 [38:05<1:36:11,  5.11it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11722/41242 [38:05<1:35:23,  5.16it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11722/41242 [38:06<1:35:23,  5.16it/s, training_loss=0.035]
Epoch 1:  28%|██▊       | 11723/41242 [38:06<1:36:05,  5.12it/s, training_loss=0.035]
Epoch 1:  28%|██▊       | 11723/41242 [38:06<1:36:05,  5.12it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11724/41242 [38:06<1:36:04,  5.12it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11724/41242 [38:06<1:36:04,  5.12it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11725/41242 [38:06<1:35:18,  5.16it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11725/41242 [38:06<1:35:18,  5.16it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11726/41242 [38:06<1:34:05,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11726/41242 [38:06<1:34:05,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11727/41242 [38:06<1:34:44,  5.19it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11727/41242 [38:07<1:34:44,  5.19it/s, training_loss=0.344]
Epoch 1:  28%|██▊       | 11728/41242 [38:07<1:36:56,  5.07it/s, training_loss=0.344]
Epoch 1:  28%|██▊       | 11728/41242 [38:07<1:36:56,  5.07it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11729/41242 [38:07<1:36:20,  5.11it/s, training_loss=0.044]
Epoch 1:  28%|██▊       | 11729/41242 [38:07<1:36:20,  5.11it/s, training_loss=0.170]
Epoch 1:  28%|██▊       | 11730/41242 [38:07<1:35:58,  5.13it/s, training_loss=0.170]
Epoch 1:  28%|██▊       | 11730/41242 [38:07<1:35:58,  5.13it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11731/41242 [38:07<1:35:06,  5.17it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11731/41242 [38:07<1:35:06,  5.17it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11732/41242 [38:07<1:34:25,  5.21it/s, training_loss=0.003]
Epoch 1:  28%|██▊       | 11732/41242 [38:08<1:34:25,  5.21it/s, training_loss=0.211]
Epoch 1:  28%|██▊       | 11733/41242 [38:08<1:35:44,  5.14it/s, training_loss=0.211]
Epoch 1:  28%|██▊       | 11733/41242 [38:08<1:35:44,  5.14it/s, training_loss=0.472]
Epoch 1:  28%|██▊       | 11734/41242 [38:08<1:37:38,  5.04it/s, training_loss=0.472]
Epoch 1:  28%|██▊       | 11734/41242 [38:08<1:37:38,  5.04it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11735/41242 [38:08<1:36:42,  5.08it/s, training_loss=0.018]
Epoch 1:  28%|██▊       | 11735/41242 [38:08<1:36:42,  5.08it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11736/41242 [38:08<1:35:53,  5.13it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11736/41242 [38:08<1:35:53,  5.13it/s, training_loss=0.079]
Epoch 1:  28%|██▊       | 11737/41242 [38:08<1:35:57,  5.12it/s, training_loss=0.079]
Epoch 1:  28%|██▊       | 11737/41242 [38:09<1:35:57,  5.12it/s, training_loss=0.045]
Epoch 1:  28%|██▊       | 11738/41242 [38:09<1:35:26,  5.15it/s, training_loss=0.045]
Epoch 1:  28%|██▊       | 11738/41242 [38:09<1:35:26,  5.15it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11739/41242 [38:09<1:34:58,  5.18it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11739/41242 [38:09<1:34:58,  5.18it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11740/41242 [38:09<1:35:29,  5.15it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11740/41242 [38:09<1:35:29,  5.15it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11741/41242 [38:09<1:36:57,  5.07it/s, training_loss=0.006]
Epoch 1:  28%|██▊       | 11741/41242 [38:09<1:36:57,  5.07it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11742/41242 [38:09<1:37:14,  5.06it/s, training_loss=0.002]
Epoch 1:  28%|██▊       | 11742/41242 [38:10<1:37:14,  5.06it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11743/41242 [38:10<1:37:15,  5.06it/s, training_loss=0.012]
Epoch 1:  28%|██▊       | 11743/41242 [38:10<1:37:15,  5.06it/s, training_loss=1.052]
Epoch 1:  28%|██▊       | 11744/41242 [38:10<1:37:12,  5.06it/s, training_loss=1.052]
Epoch 1:  28%|██▊       | 11744/41242 [38:10<1:37:12,  5.06it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11745/41242 [38:10<1:37:19,  5.05it/s, training_loss=0.008]
Epoch 1:  28%|██▊       | 11745/41242 [38:10<1:37:19,  5.05it/s, training_loss=0.030]
Epoch 1:  28%|██▊       | 11746/41242 [38:10<1:37:56,  5.02it/s, training_loss=0.030]
Epoch 1:  28%|██▊       | 11746/41242 [38:10<1:37:56,  5.02it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11747/41242 [38:10<1:37:20,  5.05it/s, training_loss=0.005]
Epoch 1:  28%|██▊       | 11747/41242 [38:11<1:37:20,  5.05it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11748/41242 [38:11<1:36:49,  5.08it/s, training_loss=0.007]
Epoch 1:  28%|██▊       | 11748/41242 [38:11<1:36:49,  5.08it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11749/41242 [38:11<1:36:35,  5.09it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11749/41242 [38:11<1:36:35,  5.09it/s, training_loss=0.036]
Epoch 1:  28%|██▊       | 11750/41242 [38:11<1:36:39,  5.09it/s, training_loss=0.036]
Epoch 1:  28%|██▊       | 11750/41242 [38:11<1:36:39,  5.09it/s, training_loss=0.298]
Epoch 1:  28%|██▊       | 11751/41242 [38:11<1:35:30,  5.15it/s, training_loss=0.298]
Epoch 1:  28%|██▊       | 11751/41242 [38:11<1:35:30,  5.15it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11752/41242 [38:11<1:35:04,  5.17it/s, training_loss=0.009]
Epoch 1:  28%|██▊       | 11752/41242 [38:11<1:35:04,  5.17it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11753/41242 [38:11<1:33:55,  5.23it/s, training_loss=0.004]
Epoch 1:  28%|██▊       | 11753/41242 [38:12<1:33:55,  5.23it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11754/41242 [38:12<1:33:21,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11754/41242 [38:12<1:33:21,  5.26it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11755/41242 [38:12<1:34:27,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11755/41242 [38:12<1:34:27,  5.20it/s, training_loss=0.048]
Epoch 1:  29%|██▊       | 11756/41242 [38:12<1:35:37,  5.14it/s, training_loss=0.048]
Epoch 1:  29%|██▊       | 11756/41242 [38:12<1:35:37,  5.14it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11757/41242 [38:12<1:35:18,  5.16it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11757/41242 [38:12<1:35:18,  5.16it/s, training_loss=0.009]
Epoch 1:  29%|██▊       | 11758/41242 [38:12<1:35:52,  5.13it/s, training_loss=0.009]
Epoch 1:  29%|██▊       | 11758/41242 [38:13<1:35:52,  5.13it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11759/41242 [38:13<1:34:33,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11759/41242 [38:13<1:34:33,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11760/41242 [38:13<1:34:24,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11760/41242 [38:13<1:34:24,  5.20it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11761/41242 [38:13<1:33:29,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11761/41242 [38:13<1:33:29,  5.26it/s, training_loss=0.482]
Epoch 1:  29%|██▊       | 11762/41242 [38:13<1:33:25,  5.26it/s, training_loss=0.482]
Epoch 1:  29%|██▊       | 11762/41242 [38:13<1:33:25,  5.26it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11763/41242 [38:13<1:33:11,  5.27it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11763/41242 [38:14<1:33:11,  5.27it/s, training_loss=0.028]
Epoch 1:  29%|██▊       | 11764/41242 [38:14<1:33:50,  5.23it/s, training_loss=0.028]
Epoch 1:  29%|██▊       | 11764/41242 [38:14<1:33:50,  5.23it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11765/41242 [38:14<1:34:00,  5.23it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11765/41242 [38:14<1:34:00,  5.23it/s, training_loss=0.033]
Epoch 1:  29%|██▊       | 11766/41242 [38:14<1:35:13,  5.16it/s, training_loss=0.033]
Epoch 1:  29%|██▊       | 11766/41242 [38:14<1:35:13,  5.16it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11767/41242 [38:14<1:35:04,  5.17it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11767/41242 [38:14<1:35:04,  5.17it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11768/41242 [38:14<1:33:50,  5.23it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11768/41242 [38:15<1:33:50,  5.23it/s, training_loss=0.150]
Epoch 1:  29%|██▊       | 11769/41242 [38:15<1:33:33,  5.25it/s, training_loss=0.150]
Epoch 1:  29%|██▊       | 11769/41242 [38:15<1:33:33,  5.25it/s, training_loss=0.020]
Epoch 1:  29%|██▊       | 11770/41242 [38:15<1:33:41,  5.24it/s, training_loss=0.020]
Epoch 1:  29%|██▊       | 11770/41242 [38:15<1:33:41,  5.24it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11771/41242 [38:15<1:32:45,  5.30it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11771/41242 [38:15<1:32:45,  5.30it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11772/41242 [38:15<1:32:02,  5.34it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11772/41242 [38:15<1:32:02,  5.34it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11773/41242 [38:15<1:33:15,  5.27it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11773/41242 [38:15<1:33:15,  5.27it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11774/41242 [38:15<1:33:45,  5.24it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11774/41242 [38:16<1:33:45,  5.24it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11775/41242 [38:16<1:33:41,  5.24it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11775/41242 [38:16<1:33:41,  5.24it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11776/41242 [38:16<1:33:19,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11776/41242 [38:16<1:33:19,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11777/41242 [38:16<1:32:55,  5.28it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11777/41242 [38:16<1:32:55,  5.28it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11778/41242 [38:16<1:33:45,  5.24it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11778/41242 [38:16<1:33:45,  5.24it/s, training_loss=0.011]
Epoch 1:  29%|██▊       | 11779/41242 [38:16<1:33:56,  5.23it/s, training_loss=0.011]
Epoch 1:  29%|██▊       | 11779/41242 [38:17<1:33:56,  5.23it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11780/41242 [38:17<1:33:27,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11780/41242 [38:17<1:33:27,  5.25it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11781/41242 [38:17<1:33:20,  5.26it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11781/41242 [38:17<1:33:20,  5.26it/s, training_loss=0.303]
Epoch 1:  29%|██▊       | 11782/41242 [38:17<1:33:03,  5.28it/s, training_loss=0.303]
Epoch 1:  29%|██▊       | 11782/41242 [38:17<1:33:03,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11783/41242 [38:17<1:33:01,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11783/41242 [38:17<1:33:01,  5.28it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11784/41242 [38:17<1:34:35,  5.19it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11784/41242 [38:18<1:34:35,  5.19it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11785/41242 [38:18<1:34:18,  5.21it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11785/41242 [38:18<1:34:18,  5.21it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11786/41242 [38:18<1:33:35,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11786/41242 [38:18<1:33:35,  5.25it/s, training_loss=0.266]
Epoch 1:  29%|██▊       | 11787/41242 [38:18<1:33:25,  5.25it/s, training_loss=0.266]
Epoch 1:  29%|██▊       | 11787/41242 [38:18<1:33:25,  5.25it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11788/41242 [38:18<1:33:15,  5.26it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11788/41242 [38:18<1:33:15,  5.26it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11789/41242 [38:18<1:33:08,  5.27it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11789/41242 [38:19<1:33:08,  5.27it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11790/41242 [38:19<1:33:31,  5.25it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11790/41242 [38:19<1:33:31,  5.25it/s, training_loss=0.682]
Epoch 1:  29%|██▊       | 11791/41242 [38:19<1:33:49,  5.23it/s, training_loss=0.682]
Epoch 1:  29%|██▊       | 11791/41242 [38:19<1:33:49,  5.23it/s, training_loss=0.801]
Epoch 1:  29%|██▊       | 11792/41242 [38:19<1:33:29,  5.25it/s, training_loss=0.801]
Epoch 1:  29%|██▊       | 11792/41242 [38:19<1:33:29,  5.25it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11793/41242 [38:19<1:33:01,  5.28it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11793/41242 [38:19<1:33:01,  5.28it/s, training_loss=0.008]
Epoch 1:  29%|██▊       | 11794/41242 [38:19<1:32:27,  5.31it/s, training_loss=0.008]
Epoch 1:  29%|██▊       | 11794/41242 [38:19<1:32:27,  5.31it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11795/41242 [38:19<1:31:38,  5.36it/s, training_loss=0.005]
Epoch 1:  29%|██▊       | 11795/41242 [38:20<1:31:38,  5.36it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11796/41242 [38:20<1:31:31,  5.36it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11796/41242 [38:20<1:31:31,  5.36it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11797/41242 [38:20<1:33:46,  5.23it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11797/41242 [38:20<1:33:46,  5.23it/s, training_loss=0.066]
Epoch 1:  29%|██▊       | 11798/41242 [38:20<1:33:52,  5.23it/s, training_loss=0.066]
Epoch 1:  29%|██▊       | 11798/41242 [38:20<1:33:52,  5.23it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11799/41242 [38:20<1:34:13,  5.21it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11799/41242 [38:20<1:34:13,  5.21it/s, training_loss=0.647]
Epoch 1:  29%|██▊       | 11800/41242 [38:20<1:33:14,  5.26it/s, training_loss=0.647]
Epoch 1:  29%|██▊       | 11800/41242 [38:21<1:33:14,  5.26it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11801/41242 [38:21<1:32:28,  5.31it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11801/41242 [38:21<1:32:28,  5.31it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11802/41242 [38:21<1:31:48,  5.34it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11802/41242 [38:21<1:31:48,  5.34it/s, training_loss=0.012]
Epoch 1:  29%|██▊       | 11803/41242 [38:21<1:32:09,  5.32it/s, training_loss=0.012]
Epoch 1:  29%|██▊       | 11803/41242 [38:21<1:32:09,  5.32it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11804/41242 [38:21<1:32:54,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11804/41242 [38:21<1:32:54,  5.28it/s, training_loss=0.018]
Epoch 1:  29%|██▊       | 11805/41242 [38:21<1:33:06,  5.27it/s, training_loss=0.018]
Epoch 1:  29%|██▊       | 11805/41242 [38:22<1:33:06,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11806/41242 [38:22<1:34:18,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11806/41242 [38:22<1:34:18,  5.20it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11807/41242 [38:22<1:34:13,  5.21it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11807/41242 [38:22<1:34:13,  5.21it/s, training_loss=0.208]
Epoch 1:  29%|██▊       | 11808/41242 [38:22<1:34:16,  5.20it/s, training_loss=0.208]
Epoch 1:  29%|██▊       | 11808/41242 [38:22<1:34:16,  5.20it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11809/41242 [38:22<1:35:58,  5.11it/s, training_loss=0.016]
Epoch 1:  29%|██▊       | 11809/41242 [38:22<1:35:58,  5.11it/s, training_loss=0.444]
Epoch 1:  29%|██▊       | 11810/41242 [38:22<1:36:49,  5.07it/s, training_loss=0.444]
Epoch 1:  29%|██▊       | 11810/41242 [38:23<1:36:49,  5.07it/s, training_loss=0.754]
Epoch 1:  29%|██▊       | 11811/41242 [38:23<1:36:13,  5.10it/s, training_loss=0.754]
Epoch 1:  29%|██▊       | 11811/41242 [38:23<1:36:13,  5.10it/s, training_loss=0.019]
Epoch 1:  29%|██▊       | 11812/41242 [38:23<1:35:36,  5.13it/s, training_loss=0.019]
Epoch 1:  29%|██▊       | 11812/41242 [38:23<1:35:36,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11813/41242 [38:23<1:35:49,  5.12it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11813/41242 [38:23<1:35:49,  5.12it/s, training_loss=0.546]
Epoch 1:  29%|██▊       | 11814/41242 [38:23<1:35:06,  5.16it/s, training_loss=0.546]
Epoch 1:  29%|██▊       | 11814/41242 [38:23<1:35:06,  5.16it/s, training_loss=0.084]
Epoch 1:  29%|██▊       | 11815/41242 [38:23<1:34:24,  5.19it/s, training_loss=0.084]
Epoch 1:  29%|██▊       | 11815/41242 [38:24<1:34:24,  5.19it/s, training_loss=0.030]
Epoch 1:  29%|██▊       | 11816/41242 [38:24<1:36:23,  5.09it/s, training_loss=0.030]
Epoch 1:  29%|██▊       | 11816/41242 [38:24<1:36:23,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11817/41242 [38:24<1:36:29,  5.08it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11817/41242 [38:24<1:36:29,  5.08it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11818/41242 [38:24<1:35:44,  5.12it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11818/41242 [38:24<1:35:44,  5.12it/s, training_loss=0.017]
Epoch 1:  29%|██▊       | 11819/41242 [38:24<1:36:28,  5.08it/s, training_loss=0.017]
Epoch 1:  29%|██▊       | 11819/41242 [38:24<1:36:28,  5.08it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11820/41242 [38:24<1:36:02,  5.11it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11820/41242 [38:25<1:36:02,  5.11it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11821/41242 [38:25<1:37:16,  5.04it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11821/41242 [38:25<1:37:16,  5.04it/s, training_loss=0.067]
Epoch 1:  29%|██▊       | 11822/41242 [38:25<1:36:46,  5.07it/s, training_loss=0.067]
Epoch 1:  29%|██▊       | 11822/41242 [38:25<1:36:46,  5.07it/s, training_loss=0.023]
Epoch 1:  29%|██▊       | 11823/41242 [38:25<1:38:47,  4.96it/s, training_loss=0.023]
Epoch 1:  29%|██▊       | 11823/41242 [38:25<1:38:47,  4.96it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11824/41242 [38:25<1:38:14,  4.99it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11824/41242 [38:25<1:38:14,  4.99it/s, training_loss=0.142]
Epoch 1:  29%|██▊       | 11825/41242 [38:25<1:37:56,  5.01it/s, training_loss=0.142]
Epoch 1:  29%|██▊       | 11825/41242 [38:26<1:37:56,  5.01it/s, training_loss=0.013]
Epoch 1:  29%|██▊       | 11826/41242 [38:26<1:36:16,  5.09it/s, training_loss=0.013]
Epoch 1:  29%|██▊       | 11826/41242 [38:26<1:36:16,  5.09it/s, training_loss=0.144]
Epoch 1:  29%|██▊       | 11827/41242 [38:26<1:35:10,  5.15it/s, training_loss=0.144]
Epoch 1:  29%|██▊       | 11827/41242 [38:26<1:35:10,  5.15it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11828/41242 [38:26<1:34:15,  5.20it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11828/41242 [38:26<1:34:15,  5.20it/s, training_loss=0.010]
Epoch 1:  29%|██▊       | 11829/41242 [38:26<1:33:32,  5.24it/s, training_loss=0.010]
Epoch 1:  29%|██▊       | 11829/41242 [38:26<1:33:32,  5.24it/s, training_loss=0.009]
Epoch 1:  29%|██▊       | 11830/41242 [38:26<1:33:14,  5.26it/s, training_loss=0.009]
Epoch 1:  29%|██▊       | 11830/41242 [38:26<1:33:14,  5.26it/s, training_loss=0.098]
Epoch 1:  29%|██▊       | 11831/41242 [38:26<1:33:22,  5.25it/s, training_loss=0.098]
Epoch 1:  29%|██▊       | 11831/41242 [38:27<1:33:22,  5.25it/s, training_loss=0.023]
Epoch 1:  29%|██▊       | 11832/41242 [38:27<1:34:32,  5.18it/s, training_loss=0.023]
Epoch 1:  29%|██▊       | 11832/41242 [38:27<1:34:32,  5.18it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11833/41242 [38:27<1:33:26,  5.25it/s, training_loss=0.006]
Epoch 1:  29%|██▊       | 11833/41242 [38:27<1:33:26,  5.25it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11834/41242 [38:27<1:32:48,  5.28it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11834/41242 [38:27<1:32:48,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11835/41242 [38:27<1:33:07,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11835/41242 [38:27<1:33:07,  5.26it/s, training_loss=0.467]
Epoch 1:  29%|██▊       | 11836/41242 [38:27<1:32:43,  5.29it/s, training_loss=0.467]
Epoch 1:  29%|██▊       | 11836/41242 [38:28<1:32:43,  5.29it/s, training_loss=0.254]
Epoch 1:  29%|██▊       | 11837/41242 [38:28<1:33:19,  5.25it/s, training_loss=0.254]
Epoch 1:  29%|██▊       | 11837/41242 [38:28<1:33:19,  5.25it/s, training_loss=0.272]
Epoch 1:  29%|██▊       | 11838/41242 [38:28<1:35:21,  5.14it/s, training_loss=0.272]
Epoch 1:  29%|██▊       | 11838/41242 [38:28<1:35:21,  5.14it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11839/41242 [38:28<1:34:14,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11839/41242 [38:28<1:34:14,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11840/41242 [38:28<1:33:46,  5.23it/s, training_loss=0.004]
Epoch 1:  29%|██▊       | 11840/41242 [38:28<1:33:46,  5.23it/s, training_loss=0.013]
Epoch 1:  29%|██▊       | 11841/41242 [38:28<1:33:35,  5.24it/s, training_loss=0.013]
Epoch 1:  29%|██▊       | 11841/41242 [38:29<1:33:35,  5.24it/s, training_loss=0.011]
Epoch 1:  29%|██▊       | 11842/41242 [38:29<1:32:42,  5.28it/s, training_loss=0.011]
Epoch 1:  29%|██▊       | 11842/41242 [38:29<1:32:42,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11843/41242 [38:29<1:32:22,  5.30it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11843/41242 [38:29<1:32:22,  5.30it/s, training_loss=0.001]
Epoch 1:  29%|██▊       | 11844/41242 [38:29<1:32:19,  5.31it/s, training_loss=0.001]
Epoch 1:  29%|██▊       | 11844/41242 [38:29<1:32:19,  5.31it/s, training_loss=0.039]
Epoch 1:  29%|██▊       | 11845/41242 [38:29<1:33:16,  5.25it/s, training_loss=0.039]
Epoch 1:  29%|██▊       | 11845/41242 [38:29<1:33:16,  5.25it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11846/41242 [38:29<1:32:25,  5.30it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11846/41242 [38:29<1:32:25,  5.30it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11847/41242 [38:29<1:32:08,  5.32it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11847/41242 [38:30<1:32:08,  5.32it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11848/41242 [38:30<1:33:08,  5.26it/s, training_loss=0.007]
Epoch 1:  29%|██▊       | 11848/41242 [38:30<1:33:08,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11849/41242 [38:30<1:32:22,  5.30it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11849/41242 [38:30<1:32:22,  5.30it/s, training_loss=0.042]
Epoch 1:  29%|██▊       | 11850/41242 [38:30<1:33:01,  5.27it/s, training_loss=0.042]
Epoch 1:  29%|██▊       | 11850/41242 [38:30<1:33:01,  5.27it/s, training_loss=0.084]
Epoch 1:  29%|██▊       | 11851/41242 [38:30<1:34:38,  5.18it/s, training_loss=0.084]
Epoch 1:  29%|██▊       | 11851/41242 [38:30<1:34:38,  5.18it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11852/41242 [38:30<1:35:23,  5.14it/s, training_loss=0.003]
Epoch 1:  29%|██▊       | 11852/41242 [38:31<1:35:23,  5.14it/s, training_loss=0.049]
Epoch 1:  29%|██▊       | 11853/41242 [38:31<1:35:27,  5.13it/s, training_loss=0.049]
Epoch 1:  29%|██▊       | 11853/41242 [38:31<1:35:27,  5.13it/s, training_loss=0.047]
Epoch 1:  29%|██▊       | 11854/41242 [38:31<1:35:32,  5.13it/s, training_loss=0.047]
Epoch 1:  29%|██▊       | 11854/41242 [38:31<1:35:32,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11855/41242 [38:31<1:34:28,  5.18it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11855/41242 [38:31<1:34:28,  5.18it/s, training_loss=0.576]
Epoch 1:  29%|██▊       | 11856/41242 [38:31<1:34:33,  5.18it/s, training_loss=0.576]
Epoch 1:  29%|██▊       | 11856/41242 [38:31<1:34:33,  5.18it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11857/41242 [38:31<1:34:33,  5.18it/s, training_loss=0.002]
Epoch 1:  29%|██▊       | 11857/41242 [38:32<1:34:33,  5.18it/s, training_loss=0.021]
Epoch 1:  29%|██▉       | 11858/41242 [38:32<1:39:04,  4.94it/s, training_loss=0.021]
Epoch 1:  29%|██▉       | 11858/41242 [38:32<1:39:04,  4.94it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11859/41242 [38:32<1:38:42,  4.96it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11859/41242 [38:32<1:38:42,  4.96it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11860/41242 [38:32<1:36:52,  5.05it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11860/41242 [38:32<1:36:52,  5.05it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11861/41242 [38:32<1:35:17,  5.14it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11861/41242 [38:32<1:35:17,  5.14it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11862/41242 [38:32<1:37:01,  5.05it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11862/41242 [38:33<1:37:01,  5.05it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11863/41242 [38:33<1:36:31,  5.07it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11863/41242 [38:33<1:36:31,  5.07it/s, training_loss=0.041]
Epoch 1:  29%|██▉       | 11864/41242 [38:33<1:36:51,  5.05it/s, training_loss=0.041]
Epoch 1:  29%|██▉       | 11864/41242 [38:33<1:36:51,  5.05it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11865/41242 [38:33<1:35:21,  5.13it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11865/41242 [38:33<1:35:21,  5.13it/s, training_loss=0.121]
Epoch 1:  29%|██▉       | 11866/41242 [38:33<1:34:50,  5.16it/s, training_loss=0.121]
Epoch 1:  29%|██▉       | 11866/41242 [38:33<1:34:50,  5.16it/s, training_loss=0.492]
Epoch 1:  29%|██▉       | 11867/41242 [38:33<1:34:14,  5.20it/s, training_loss=0.492]
Epoch 1:  29%|██▉       | 11867/41242 [38:34<1:34:14,  5.20it/s, training_loss=0.144]
Epoch 1:  29%|██▉       | 11868/41242 [38:34<1:34:14,  5.19it/s, training_loss=0.144]
Epoch 1:  29%|██▉       | 11868/41242 [38:34<1:34:14,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11869/41242 [38:34<1:33:54,  5.21it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11869/41242 [38:34<1:33:54,  5.21it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 11870/41242 [38:34<1:33:00,  5.26it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 11870/41242 [38:34<1:33:00,  5.26it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11871/41242 [38:34<1:33:03,  5.26it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11871/41242 [38:34<1:33:03,  5.26it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11872/41242 [38:34<1:34:25,  5.18it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11872/41242 [38:35<1:34:25,  5.18it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11873/41242 [38:35<1:34:30,  5.18it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11873/41242 [38:35<1:34:30,  5.18it/s, training_loss=0.385]
Epoch 1:  29%|██▉       | 11874/41242 [38:35<1:36:05,  5.09it/s, training_loss=0.385]
Epoch 1:  29%|██▉       | 11874/41242 [38:35<1:36:05,  5.09it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 11875/41242 [38:35<1:35:29,  5.13it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 11875/41242 [38:35<1:35:29,  5.13it/s, training_loss=0.164]
Epoch 1:  29%|██▉       | 11876/41242 [38:35<1:37:11,  5.04it/s, training_loss=0.164]
Epoch 1:  29%|██▉       | 11876/41242 [38:35<1:37:11,  5.04it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11877/41242 [38:35<1:36:16,  5.08it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11877/41242 [38:36<1:36:16,  5.08it/s, training_loss=0.701]
Epoch 1:  29%|██▉       | 11878/41242 [38:36<1:35:25,  5.13it/s, training_loss=0.701]
Epoch 1:  29%|██▉       | 11878/41242 [38:36<1:35:25,  5.13it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11879/41242 [38:36<1:36:02,  5.10it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11879/41242 [38:36<1:36:02,  5.10it/s, training_loss=0.361]
Epoch 1:  29%|██▉       | 11880/41242 [38:36<1:36:40,  5.06it/s, training_loss=0.361]
Epoch 1:  29%|██▉       | 11880/41242 [38:36<1:36:40,  5.06it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11881/41242 [38:36<1:37:47,  5.00it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11881/41242 [38:36<1:37:47,  5.00it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11882/41242 [38:36<1:36:56,  5.05it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11882/41242 [38:37<1:36:56,  5.05it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11883/41242 [38:37<1:36:03,  5.09it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11883/41242 [38:37<1:36:03,  5.09it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11884/41242 [38:37<1:35:30,  5.12it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11884/41242 [38:37<1:35:30,  5.12it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11885/41242 [38:37<1:34:19,  5.19it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11885/41242 [38:37<1:34:19,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11886/41242 [38:37<1:33:03,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11886/41242 [38:37<1:33:03,  5.26it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11887/41242 [38:37<1:33:27,  5.24it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11887/41242 [38:37<1:33:27,  5.24it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11888/41242 [38:37<1:33:07,  5.25it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11888/41242 [38:38<1:33:07,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11889/41242 [38:38<1:32:35,  5.28it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11889/41242 [38:38<1:32:35,  5.28it/s, training_loss=0.075]
Epoch 1:  29%|██▉       | 11890/41242 [38:38<1:33:34,  5.23it/s, training_loss=0.075]
Epoch 1:  29%|██▉       | 11890/41242 [38:38<1:33:34,  5.23it/s, training_loss=0.185]
Epoch 1:  29%|██▉       | 11891/41242 [38:38<1:34:50,  5.16it/s, training_loss=0.185]
Epoch 1:  29%|██▉       | 11891/41242 [38:38<1:34:50,  5.16it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11892/41242 [38:38<1:36:01,  5.09it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11892/41242 [38:38<1:36:01,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11893/41242 [38:38<1:35:12,  5.14it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11893/41242 [38:39<1:35:12,  5.14it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11894/41242 [38:39<1:37:20,  5.02it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11894/41242 [38:39<1:37:20,  5.02it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 11895/41242 [38:39<1:38:22,  4.97it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 11895/41242 [38:39<1:38:22,  4.97it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11896/41242 [38:39<1:36:46,  5.05it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11896/41242 [38:39<1:36:46,  5.05it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11897/41242 [38:39<1:36:22,  5.07it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11897/41242 [38:39<1:36:22,  5.07it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 11898/41242 [38:39<1:36:04,  5.09it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 11898/41242 [38:40<1:36:04,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11899/41242 [38:40<1:36:43,  5.06it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11899/41242 [38:40<1:36:43,  5.06it/s, training_loss=0.137]
Epoch 1:  29%|██▉       | 11900/41242 [38:40<1:37:22,  5.02it/s, training_loss=0.137]
Epoch 1:  29%|██▉       | 11900/41242 [38:40<1:37:22,  5.02it/s, training_loss=0.119]
Epoch 1:  29%|██▉       | 11901/41242 [38:40<1:38:39,  4.96it/s, training_loss=0.119]
Epoch 1:  29%|██▉       | 11901/41242 [38:40<1:38:39,  4.96it/s, training_loss=0.319]
Epoch 1:  29%|██▉       | 11902/41242 [38:40<1:38:32,  4.96it/s, training_loss=0.319]
Epoch 1:  29%|██▉       | 11902/41242 [38:40<1:38:32,  4.96it/s, training_loss=0.538]
Epoch 1:  29%|██▉       | 11903/41242 [38:40<1:37:34,  5.01it/s, training_loss=0.538]
Epoch 1:  29%|██▉       | 11903/41242 [38:41<1:37:34,  5.01it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11904/41242 [38:41<1:36:37,  5.06it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11904/41242 [38:41<1:36:37,  5.06it/s, training_loss=0.027]
Epoch 1:  29%|██▉       | 11905/41242 [38:41<1:35:44,  5.11it/s, training_loss=0.027]
Epoch 1:  29%|██▉       | 11905/41242 [38:41<1:35:44,  5.11it/s, training_loss=0.201]
Epoch 1:  29%|██▉       | 11906/41242 [38:41<1:34:54,  5.15it/s, training_loss=0.201]
Epoch 1:  29%|██▉       | 11906/41242 [38:41<1:34:54,  5.15it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 11907/41242 [38:41<1:35:08,  5.14it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 11907/41242 [38:41<1:35:08,  5.14it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11908/41242 [38:41<1:36:04,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11908/41242 [38:42<1:36:04,  5.09it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11909/41242 [38:42<1:35:23,  5.12it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11909/41242 [38:42<1:35:23,  5.12it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11910/41242 [38:42<1:35:08,  5.14it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11910/41242 [38:42<1:35:08,  5.14it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 11911/41242 [38:42<1:34:12,  5.19it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 11911/41242 [38:42<1:34:12,  5.19it/s, training_loss=1.513]
Epoch 1:  29%|██▉       | 11912/41242 [38:42<1:33:46,  5.21it/s, training_loss=1.513]
Epoch 1:  29%|██▉       | 11912/41242 [38:42<1:33:46,  5.21it/s, training_loss=0.696]
Epoch 1:  29%|██▉       | 11913/41242 [38:42<1:34:20,  5.18it/s, training_loss=0.696]
Epoch 1:  29%|██▉       | 11913/41242 [38:43<1:34:20,  5.18it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11914/41242 [38:43<1:36:19,  5.07it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11914/41242 [38:43<1:36:19,  5.07it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11915/41242 [38:43<1:37:23,  5.02it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11915/41242 [38:43<1:37:23,  5.02it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11916/41242 [38:43<1:37:08,  5.03it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11916/41242 [38:43<1:37:08,  5.03it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11917/41242 [38:43<1:36:33,  5.06it/s, training_loss=0.045]
Epoch 1:  29%|██▉       | 11917/41242 [38:43<1:36:33,  5.06it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 11918/41242 [38:43<1:35:12,  5.13it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 11918/41242 [38:44<1:35:12,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11919/41242 [38:44<1:33:51,  5.21it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11919/41242 [38:44<1:33:51,  5.21it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 11920/41242 [38:44<1:33:56,  5.20it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 11920/41242 [38:44<1:33:56,  5.20it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11921/41242 [38:44<1:35:24,  5.12it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11921/41242 [38:44<1:35:24,  5.12it/s, training_loss=0.050]
Epoch 1:  29%|██▉       | 11922/41242 [38:44<1:34:37,  5.16it/s, training_loss=0.050]
Epoch 1:  29%|██▉       | 11922/41242 [38:44<1:34:37,  5.16it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11923/41242 [38:44<1:33:21,  5.23it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11923/41242 [38:45<1:33:21,  5.23it/s, training_loss=0.497]
Epoch 1:  29%|██▉       | 11924/41242 [38:45<1:33:03,  5.25it/s, training_loss=0.497]
Epoch 1:  29%|██▉       | 11924/41242 [38:45<1:33:03,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11925/41242 [38:45<1:32:30,  5.28it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11925/41242 [38:45<1:32:30,  5.28it/s, training_loss=0.357]
Epoch 1:  29%|██▉       | 11926/41242 [38:45<1:32:23,  5.29it/s, training_loss=0.357]
Epoch 1:  29%|██▉       | 11926/41242 [38:45<1:32:23,  5.29it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11927/41242 [38:45<1:31:45,  5.33it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11927/41242 [38:45<1:31:45,  5.33it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11928/41242 [38:45<1:31:15,  5.35it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11928/41242 [38:45<1:31:15,  5.35it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11929/41242 [38:45<1:31:00,  5.37it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11929/41242 [38:46<1:31:00,  5.37it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11930/41242 [38:46<1:32:04,  5.31it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11930/41242 [38:46<1:32:04,  5.31it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 11931/41242 [38:46<1:32:59,  5.25it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 11931/41242 [38:46<1:32:59,  5.25it/s, training_loss=0.037]
Epoch 1:  29%|██▉       | 11932/41242 [38:46<1:34:32,  5.17it/s, training_loss=0.037]
Epoch 1:  29%|██▉       | 11932/41242 [38:46<1:34:32,  5.17it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11933/41242 [38:46<1:34:20,  5.18it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11933/41242 [38:46<1:34:20,  5.18it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11934/41242 [38:46<1:33:33,  5.22it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11934/41242 [38:47<1:33:33,  5.22it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11935/41242 [38:47<1:33:21,  5.23it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11935/41242 [38:47<1:33:21,  5.23it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11936/41242 [38:47<1:32:57,  5.25it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11936/41242 [38:47<1:32:57,  5.25it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11937/41242 [38:47<1:32:10,  5.30it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11937/41242 [38:47<1:32:10,  5.30it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 11938/41242 [38:47<1:34:13,  5.18it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 11938/41242 [38:47<1:34:13,  5.18it/s, training_loss=0.018]
Epoch 1:  29%|██▉       | 11939/41242 [38:47<1:33:50,  5.20it/s, training_loss=0.018]
Epoch 1:  29%|██▉       | 11939/41242 [38:48<1:33:50,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11940/41242 [38:48<1:34:20,  5.18it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11940/41242 [38:48<1:34:20,  5.18it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 11941/41242 [38:48<1:33:34,  5.22it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 11941/41242 [38:48<1:33:34,  5.22it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11942/41242 [38:48<1:32:56,  5.25it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11942/41242 [38:48<1:32:56,  5.25it/s, training_loss=0.024]
Epoch 1:  29%|██▉       | 11943/41242 [38:48<1:34:11,  5.18it/s, training_loss=0.024]
Epoch 1:  29%|██▉       | 11943/41242 [38:48<1:34:11,  5.18it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11944/41242 [38:48<1:33:36,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11944/41242 [38:49<1:33:36,  5.22it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 11945/41242 [38:49<1:33:54,  5.20it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 11945/41242 [38:49<1:33:54,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11946/41242 [38:49<1:33:13,  5.24it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11946/41242 [38:49<1:33:13,  5.24it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11947/41242 [38:49<1:32:46,  5.26it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11947/41242 [38:49<1:32:46,  5.26it/s, training_loss=0.085]
Epoch 1:  29%|██▉       | 11948/41242 [38:49<1:33:17,  5.23it/s, training_loss=0.085]
Epoch 1:  29%|██▉       | 11948/41242 [38:49<1:33:17,  5.23it/s, training_loss=0.407]
Epoch 1:  29%|██▉       | 11949/41242 [38:49<1:33:12,  5.24it/s, training_loss=0.407]
Epoch 1:  29%|██▉       | 11949/41242 [38:49<1:33:12,  5.24it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11950/41242 [38:49<1:33:15,  5.23it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11950/41242 [38:50<1:33:15,  5.23it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11951/41242 [38:50<1:32:24,  5.28it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11951/41242 [38:50<1:32:24,  5.28it/s, training_loss=0.627]
Epoch 1:  29%|██▉       | 11952/41242 [38:50<1:33:30,  5.22it/s, training_loss=0.627]
Epoch 1:  29%|██▉       | 11952/41242 [38:50<1:33:30,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11953/41242 [38:50<1:32:36,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11953/41242 [38:50<1:32:36,  5.27it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11954/41242 [38:50<1:33:39,  5.21it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11954/41242 [38:50<1:33:39,  5.21it/s, training_loss=0.081]
Epoch 1:  29%|██▉       | 11955/41242 [38:50<1:34:17,  5.18it/s, training_loss=0.081]
Epoch 1:  29%|██▉       | 11955/41242 [38:51<1:34:17,  5.18it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11956/41242 [38:51<1:34:24,  5.17it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11956/41242 [38:51<1:34:24,  5.17it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11957/41242 [38:51<1:35:16,  5.12it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11957/41242 [38:51<1:35:16,  5.12it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11958/41242 [38:51<1:35:53,  5.09it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 11958/41242 [38:51<1:35:53,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11959/41242 [38:51<1:34:47,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11959/41242 [38:51<1:34:47,  5.15it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11960/41242 [38:51<1:32:50,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11960/41242 [38:52<1:32:50,  5.26it/s, training_loss=0.053]
Epoch 1:  29%|██▉       | 11961/41242 [38:52<1:32:29,  5.28it/s, training_loss=0.053]
Epoch 1:  29%|██▉       | 11961/41242 [38:52<1:32:29,  5.28it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11962/41242 [38:52<1:33:02,  5.24it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11962/41242 [38:52<1:33:02,  5.24it/s, training_loss=0.122]
Epoch 1:  29%|██▉       | 11963/41242 [38:52<1:33:00,  5.25it/s, training_loss=0.122]
Epoch 1:  29%|██▉       | 11963/41242 [38:52<1:33:00,  5.25it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 11964/41242 [38:52<1:32:40,  5.27it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 11964/41242 [38:52<1:32:40,  5.27it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11965/41242 [38:52<1:32:29,  5.28it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11965/41242 [38:53<1:32:29,  5.28it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11966/41242 [38:53<1:32:45,  5.26it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11966/41242 [38:53<1:32:45,  5.26it/s, training_loss=0.580]
Epoch 1:  29%|██▉       | 11967/41242 [38:53<1:32:40,  5.27it/s, training_loss=0.580]
Epoch 1:  29%|██▉       | 11967/41242 [38:53<1:32:40,  5.27it/s, training_loss=0.544]
Epoch 1:  29%|██▉       | 11968/41242 [38:53<1:33:35,  5.21it/s, training_loss=0.544]
Epoch 1:  29%|██▉       | 11968/41242 [38:53<1:33:35,  5.21it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11969/41242 [38:53<1:33:24,  5.22it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11969/41242 [38:53<1:33:24,  5.22it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11970/41242 [38:53<1:32:59,  5.25it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11970/41242 [38:53<1:32:59,  5.25it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11971/41242 [38:53<1:32:41,  5.26it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11971/41242 [38:54<1:32:41,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11972/41242 [38:54<1:32:30,  5.27it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11972/41242 [38:54<1:32:30,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11973/41242 [38:54<1:33:44,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11973/41242 [38:54<1:33:44,  5.20it/s, training_loss=0.017]
Epoch 1:  29%|██▉       | 11974/41242 [38:54<1:34:32,  5.16it/s, training_loss=0.017]
Epoch 1:  29%|██▉       | 11974/41242 [38:54<1:34:32,  5.16it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11975/41242 [38:54<1:34:14,  5.18it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11975/41242 [38:54<1:34:14,  5.18it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 11976/41242 [38:54<1:34:41,  5.15it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 11976/41242 [38:55<1:34:41,  5.15it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 11977/41242 [38:55<1:33:20,  5.23it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 11977/41242 [38:55<1:33:20,  5.23it/s, training_loss=0.032]
Epoch 1:  29%|██▉       | 11978/41242 [38:55<1:33:11,  5.23it/s, training_loss=0.032]
Epoch 1:  29%|██▉       | 11978/41242 [38:55<1:33:11,  5.23it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11979/41242 [38:55<1:33:23,  5.22it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11979/41242 [38:55<1:33:23,  5.22it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 11980/41242 [38:55<1:33:22,  5.22it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 11980/41242 [38:55<1:33:22,  5.22it/s, training_loss=0.460]
Epoch 1:  29%|██▉       | 11981/41242 [38:55<1:34:23,  5.17it/s, training_loss=0.460]
Epoch 1:  29%|██▉       | 11981/41242 [38:56<1:34:23,  5.17it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11982/41242 [38:56<1:34:23,  5.17it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11982/41242 [38:56<1:34:23,  5.17it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11983/41242 [38:56<1:35:33,  5.10it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11983/41242 [38:56<1:35:33,  5.10it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11984/41242 [38:56<1:34:30,  5.16it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 11984/41242 [38:56<1:34:30,  5.16it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11985/41242 [38:56<1:35:51,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11985/41242 [38:56<1:35:51,  5.09it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11986/41242 [38:56<1:39:16,  4.91it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 11986/41242 [38:57<1:39:16,  4.91it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11987/41242 [38:57<1:38:29,  4.95it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 11987/41242 [38:57<1:38:29,  4.95it/s, training_loss=0.012]
Epoch 1:  29%|██▉       | 11988/41242 [38:57<1:37:45,  4.99it/s, training_loss=0.012]
Epoch 1:  29%|██▉       | 11988/41242 [38:57<1:37:45,  4.99it/s, training_loss=0.419]
Epoch 1:  29%|██▉       | 11989/41242 [38:57<1:38:24,  4.95it/s, training_loss=0.419]
Epoch 1:  29%|██▉       | 11989/41242 [38:57<1:38:24,  4.95it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11990/41242 [38:57<1:38:43,  4.94it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 11990/41242 [38:57<1:38:43,  4.94it/s, training_loss=0.037]
Epoch 1:  29%|██▉       | 11991/41242 [38:57<1:38:16,  4.96it/s, training_loss=0.037]
Epoch 1:  29%|██▉       | 11991/41242 [38:58<1:38:16,  4.96it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11992/41242 [38:58<1:38:49,  4.93it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 11992/41242 [38:58<1:38:49,  4.93it/s, training_loss=0.240]
Epoch 1:  29%|██▉       | 11993/41242 [38:58<1:37:51,  4.98it/s, training_loss=0.240]
Epoch 1:  29%|██▉       | 11993/41242 [38:58<1:37:51,  4.98it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11994/41242 [38:58<1:37:26,  5.00it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11994/41242 [38:58<1:37:26,  5.00it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11995/41242 [38:58<1:36:05,  5.07it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 11995/41242 [38:58<1:36:05,  5.07it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11996/41242 [38:58<1:35:34,  5.10it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11996/41242 [38:59<1:35:34,  5.10it/s, training_loss=0.059]
Epoch 1:  29%|██▉       | 11997/41242 [38:59<1:35:58,  5.08it/s, training_loss=0.059]
Epoch 1:  29%|██▉       | 11997/41242 [38:59<1:35:58,  5.08it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11998/41242 [38:59<1:36:53,  5.03it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 11998/41242 [38:59<1:36:53,  5.03it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11999/41242 [38:59<1:35:03,  5.13it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 11999/41242 [38:59<1:35:03,  5.13it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12000/41242 [38:59<1:33:54,  5.19it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12000/41242 [38:59<1:33:54,  5.19it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 12001/41242 [38:59<1:32:51,  5.25it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 12001/41242 [39:00<1:32:51,  5.25it/s, training_loss=0.301]
Epoch 1:  29%|██▉       | 12002/41242 [39:00<1:32:41,  5.26it/s, training_loss=0.301]
Epoch 1:  29%|██▉       | 12002/41242 [39:00<1:32:41,  5.26it/s, training_loss=0.553]
Epoch 1:  29%|██▉       | 12003/41242 [39:00<1:33:05,  5.24it/s, training_loss=0.553]
Epoch 1:  29%|██▉       | 12003/41242 [39:00<1:33:05,  5.24it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12004/41242 [39:00<1:32:51,  5.25it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12004/41242 [39:00<1:32:51,  5.25it/s, training_loss=0.031]
Epoch 1:  29%|██▉       | 12005/41242 [39:00<1:33:18,  5.22it/s, training_loss=0.031]
Epoch 1:  29%|██▉       | 12005/41242 [39:00<1:33:18,  5.22it/s, training_loss=0.019]
Epoch 1:  29%|██▉       | 12006/41242 [39:00<1:35:53,  5.08it/s, training_loss=0.019]
Epoch 1:  29%|██▉       | 12006/41242 [39:01<1:35:53,  5.08it/s, training_loss=0.296]
Epoch 1:  29%|██▉       | 12007/41242 [39:01<1:36:02,  5.07it/s, training_loss=0.296]
Epoch 1:  29%|██▉       | 12007/41242 [39:01<1:36:02,  5.07it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12008/41242 [39:01<1:34:24,  5.16it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12008/41242 [39:01<1:34:24,  5.16it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12009/41242 [39:01<1:33:33,  5.21it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12009/41242 [39:01<1:33:33,  5.21it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12010/41242 [39:01<1:34:53,  5.13it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12010/41242 [39:01<1:34:53,  5.13it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 12011/41242 [39:01<1:35:22,  5.11it/s, training_loss=0.016]
Epoch 1:  29%|██▉       | 12011/41242 [39:02<1:35:22,  5.11it/s, training_loss=0.014]
Epoch 1:  29%|██▉       | 12012/41242 [39:02<1:34:28,  5.16it/s, training_loss=0.014]
Epoch 1:  29%|██▉       | 12012/41242 [39:02<1:34:28,  5.16it/s, training_loss=0.071]
Epoch 1:  29%|██▉       | 12013/41242 [39:02<1:34:02,  5.18it/s, training_loss=0.071]
Epoch 1:  29%|██▉       | 12013/41242 [39:02<1:34:02,  5.18it/s, training_loss=0.392]
Epoch 1:  29%|██▉       | 12014/41242 [39:02<1:33:36,  5.20it/s, training_loss=0.392]
Epoch 1:  29%|██▉       | 12014/41242 [39:02<1:33:36,  5.20it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12015/41242 [39:02<1:36:39,  5.04it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12015/41242 [39:02<1:36:39,  5.04it/s, training_loss=0.386]
Epoch 1:  29%|██▉       | 12016/41242 [39:02<1:36:00,  5.07it/s, training_loss=0.386]
Epoch 1:  29%|██▉       | 12016/41242 [39:02<1:36:00,  5.07it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12017/41242 [39:02<1:35:34,  5.10it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12017/41242 [39:03<1:35:34,  5.10it/s, training_loss=0.026]
Epoch 1:  29%|██▉       | 12018/41242 [39:03<1:34:33,  5.15it/s, training_loss=0.026]
Epoch 1:  29%|██▉       | 12018/41242 [39:03<1:34:33,  5.15it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12019/41242 [39:03<1:34:15,  5.17it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12019/41242 [39:03<1:34:15,  5.17it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12020/41242 [39:03<1:33:56,  5.18it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12020/41242 [39:03<1:33:56,  5.18it/s, training_loss=0.428]
Epoch 1:  29%|██▉       | 12021/41242 [39:03<1:35:34,  5.10it/s, training_loss=0.428]
Epoch 1:  29%|██▉       | 12021/41242 [39:03<1:35:34,  5.10it/s, training_loss=0.103]
Epoch 1:  29%|██▉       | 12022/41242 [39:03<1:36:06,  5.07it/s, training_loss=0.103]
Epoch 1:  29%|██▉       | 12022/41242 [39:04<1:36:06,  5.07it/s, training_loss=0.077]
Epoch 1:  29%|██▉       | 12023/41242 [39:04<1:35:35,  5.09it/s, training_loss=0.077]
Epoch 1:  29%|██▉       | 12023/41242 [39:04<1:35:35,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12024/41242 [39:04<1:35:09,  5.12it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12024/41242 [39:04<1:35:09,  5.12it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12025/41242 [39:04<1:35:38,  5.09it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12025/41242 [39:04<1:35:38,  5.09it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 12026/41242 [39:04<1:36:39,  5.04it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 12026/41242 [39:04<1:36:39,  5.04it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12027/41242 [39:04<1:36:29,  5.05it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12027/41242 [39:05<1:36:29,  5.05it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12028/41242 [39:05<1:38:17,  4.95it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12028/41242 [39:05<1:38:17,  4.95it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12029/41242 [39:05<1:35:51,  5.08it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12029/41242 [39:05<1:35:51,  5.08it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12030/41242 [39:05<1:34:21,  5.16it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12030/41242 [39:05<1:34:21,  5.16it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12031/41242 [39:05<1:33:48,  5.19it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12031/41242 [39:05<1:33:48,  5.19it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12032/41242 [39:05<1:34:57,  5.13it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12032/41242 [39:06<1:34:57,  5.13it/s, training_loss=0.232]
Epoch 1:  29%|██▉       | 12033/41242 [39:06<1:34:20,  5.16it/s, training_loss=0.232]
Epoch 1:  29%|██▉       | 12033/41242 [39:06<1:34:20,  5.16it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12034/41242 [39:06<1:35:09,  5.12it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12034/41242 [39:06<1:35:09,  5.12it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12035/41242 [39:06<1:35:48,  5.08it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12035/41242 [39:06<1:35:48,  5.08it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12036/41242 [39:06<1:36:55,  5.02it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12036/41242 [39:06<1:36:55,  5.02it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12037/41242 [39:06<1:38:45,  4.93it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12037/41242 [39:07<1:38:45,  4.93it/s, training_loss=0.291]
Epoch 1:  29%|██▉       | 12038/41242 [39:07<1:37:05,  5.01it/s, training_loss=0.291]
Epoch 1:  29%|██▉       | 12038/41242 [39:07<1:37:05,  5.01it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12039/41242 [39:07<1:36:03,  5.07it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12039/41242 [39:07<1:36:03,  5.07it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 12040/41242 [39:07<1:35:37,  5.09it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 12040/41242 [39:07<1:35:37,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12041/41242 [39:07<1:33:54,  5.18it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12041/41242 [39:07<1:33:54,  5.18it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12042/41242 [39:07<1:32:55,  5.24it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12042/41242 [39:08<1:32:55,  5.24it/s, training_loss=0.347]
Epoch 1:  29%|██▉       | 12043/41242 [39:08<1:33:23,  5.21it/s, training_loss=0.347]
Epoch 1:  29%|██▉       | 12043/41242 [39:08<1:33:23,  5.21it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12044/41242 [39:08<1:32:55,  5.24it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12044/41242 [39:08<1:32:55,  5.24it/s, training_loss=0.023]
Epoch 1:  29%|██▉       | 12045/41242 [39:08<1:32:49,  5.24it/s, training_loss=0.023]
Epoch 1:  29%|██▉       | 12045/41242 [39:08<1:32:49,  5.24it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12046/41242 [39:08<1:32:35,  5.26it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12046/41242 [39:08<1:32:35,  5.26it/s, training_loss=0.421]
Epoch 1:  29%|██▉       | 12047/41242 [39:08<1:32:40,  5.25it/s, training_loss=0.421]
Epoch 1:  29%|██▉       | 12047/41242 [39:09<1:32:40,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12048/41242 [39:09<1:32:32,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12048/41242 [39:09<1:32:32,  5.26it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12049/41242 [39:09<1:32:28,  5.26it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12049/41242 [39:09<1:32:28,  5.26it/s, training_loss=0.025]
Epoch 1:  29%|██▉       | 12050/41242 [39:09<1:35:19,  5.10it/s, training_loss=0.025]
Epoch 1:  29%|██▉       | 12050/41242 [39:09<1:35:19,  5.10it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12051/41242 [39:09<1:35:35,  5.09it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12051/41242 [39:09<1:35:35,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12052/41242 [39:09<1:34:26,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12052/41242 [39:10<1:34:26,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12053/41242 [39:10<1:34:27,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12053/41242 [39:10<1:34:27,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12054/41242 [39:10<1:35:00,  5.12it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12054/41242 [39:10<1:35:00,  5.12it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12055/41242 [39:10<1:35:31,  5.09it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12055/41242 [39:10<1:35:31,  5.09it/s, training_loss=0.060]
Epoch 1:  29%|██▉       | 12056/41242 [39:10<1:36:20,  5.05it/s, training_loss=0.060]
Epoch 1:  29%|██▉       | 12056/41242 [39:10<1:36:20,  5.05it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12057/41242 [39:10<1:35:39,  5.09it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12057/41242 [39:10<1:35:39,  5.09it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12058/41242 [39:10<1:34:25,  5.15it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12058/41242 [39:11<1:34:25,  5.15it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12059/41242 [39:11<1:35:29,  5.09it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12059/41242 [39:11<1:35:29,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12060/41242 [39:11<1:35:28,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12060/41242 [39:11<1:35:28,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12061/41242 [39:11<1:35:06,  5.11it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12061/41242 [39:11<1:35:06,  5.11it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 12062/41242 [39:11<1:35:12,  5.11it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 12062/41242 [39:11<1:35:12,  5.11it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12063/41242 [39:11<1:34:46,  5.13it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12063/41242 [39:12<1:34:46,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12064/41242 [39:12<1:35:14,  5.11it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12064/41242 [39:12<1:35:14,  5.11it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12065/41242 [39:12<1:34:38,  5.14it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12065/41242 [39:12<1:34:38,  5.14it/s, training_loss=0.098]
Epoch 1:  29%|██▉       | 12066/41242 [39:12<1:33:53,  5.18it/s, training_loss=0.098]
Epoch 1:  29%|██▉       | 12066/41242 [39:12<1:33:53,  5.18it/s, training_loss=0.030]
Epoch 1:  29%|██▉       | 12067/41242 [39:12<1:34:42,  5.13it/s, training_loss=0.030]
Epoch 1:  29%|██▉       | 12067/41242 [39:12<1:34:42,  5.13it/s, training_loss=0.017]
Epoch 1:  29%|██▉       | 12068/41242 [39:12<1:35:17,  5.10it/s, training_loss=0.017]
Epoch 1:  29%|██▉       | 12068/41242 [39:13<1:35:17,  5.10it/s, training_loss=0.044]
Epoch 1:  29%|██▉       | 12069/41242 [39:13<1:35:07,  5.11it/s, training_loss=0.044]
Epoch 1:  29%|██▉       | 12069/41242 [39:13<1:35:07,  5.11it/s, training_loss=0.024]
Epoch 1:  29%|██▉       | 12070/41242 [39:13<1:35:28,  5.09it/s, training_loss=0.024]
Epoch 1:  29%|██▉       | 12070/41242 [39:13<1:35:28,  5.09it/s, training_loss=0.021]
Epoch 1:  29%|██▉       | 12071/41242 [39:13<1:35:34,  5.09it/s, training_loss=0.021]
Epoch 1:  29%|██▉       | 12071/41242 [39:13<1:35:34,  5.09it/s, training_loss=0.105]
Epoch 1:  29%|██▉       | 12072/41242 [39:13<1:35:47,  5.08it/s, training_loss=0.105]
Epoch 1:  29%|██▉       | 12072/41242 [39:13<1:35:47,  5.08it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12073/41242 [39:13<1:34:03,  5.17it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12073/41242 [39:14<1:34:03,  5.17it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12074/41242 [39:14<1:32:41,  5.24it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12074/41242 [39:14<1:32:41,  5.24it/s, training_loss=0.089]
Epoch 1:  29%|██▉       | 12075/41242 [39:14<1:32:36,  5.25it/s, training_loss=0.089]
Epoch 1:  29%|██▉       | 12075/41242 [39:14<1:32:36,  5.25it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12076/41242 [39:14<1:32:06,  5.28it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12076/41242 [39:14<1:32:06,  5.28it/s, training_loss=0.195]
Epoch 1:  29%|██▉       | 12077/41242 [39:14<1:31:57,  5.29it/s, training_loss=0.195]
Epoch 1:  29%|██▉       | 12077/41242 [39:14<1:31:57,  5.29it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12078/41242 [39:14<1:31:52,  5.29it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12078/41242 [39:15<1:31:52,  5.29it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12079/41242 [39:15<1:32:26,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12079/41242 [39:15<1:32:26,  5.26it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12080/41242 [39:15<1:32:00,  5.28it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12080/41242 [39:15<1:32:00,  5.28it/s, training_loss=0.785]
Epoch 1:  29%|██▉       | 12081/41242 [39:15<1:32:08,  5.27it/s, training_loss=0.785]
Epoch 1:  29%|██▉       | 12081/41242 [39:15<1:32:08,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12082/41242 [39:15<1:31:39,  5.30it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12082/41242 [39:15<1:31:39,  5.30it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 12083/41242 [39:15<1:32:19,  5.26it/s, training_loss=0.055]
Epoch 1:  29%|██▉       | 12083/41242 [39:16<1:32:19,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12084/41242 [39:16<1:33:03,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12084/41242 [39:16<1:33:03,  5.22it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12085/41242 [39:16<1:33:35,  5.19it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12085/41242 [39:16<1:33:35,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12086/41242 [39:16<1:33:38,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12086/41242 [39:16<1:33:38,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12087/41242 [39:16<1:33:12,  5.21it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12087/41242 [39:16<1:33:12,  5.21it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12088/41242 [39:16<1:34:38,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12088/41242 [39:16<1:34:38,  5.13it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12089/41242 [39:16<1:34:36,  5.14it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12089/41242 [39:17<1:34:36,  5.14it/s, training_loss=0.263]
Epoch 1:  29%|██▉       | 12090/41242 [39:17<1:35:37,  5.08it/s, training_loss=0.263]
Epoch 1:  29%|██▉       | 12090/41242 [39:17<1:35:37,  5.08it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12091/41242 [39:17<1:35:18,  5.10it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12091/41242 [39:17<1:35:18,  5.10it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 12092/41242 [39:17<1:39:08,  4.90it/s, training_loss=0.007]
Epoch 1:  29%|██▉       | 12092/41242 [39:17<1:39:08,  4.90it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12093/41242 [39:17<1:36:59,  5.01it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12093/41242 [39:17<1:36:59,  5.01it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12094/41242 [39:17<1:36:48,  5.02it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12094/41242 [39:18<1:36:48,  5.02it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12095/41242 [39:18<1:36:20,  5.04it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12095/41242 [39:18<1:36:20,  5.04it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12096/41242 [39:18<1:35:28,  5.09it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12096/41242 [39:18<1:35:28,  5.09it/s, training_loss=0.041]
Epoch 1:  29%|██▉       | 12097/41242 [39:18<1:34:58,  5.11it/s, training_loss=0.041]
Epoch 1:  29%|██▉       | 12097/41242 [39:18<1:34:58,  5.11it/s, training_loss=0.023]
Epoch 1:  29%|██▉       | 12098/41242 [39:18<1:34:13,  5.15it/s, training_loss=0.023]
Epoch 1:  29%|██▉       | 12098/41242 [39:18<1:34:13,  5.15it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12099/41242 [39:18<1:33:22,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12099/41242 [39:19<1:33:22,  5.20it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12100/41242 [39:19<1:33:02,  5.22it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12100/41242 [39:19<1:33:02,  5.22it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12101/41242 [39:19<1:33:43,  5.18it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12101/41242 [39:19<1:33:43,  5.18it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12102/41242 [39:19<1:33:56,  5.17it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12102/41242 [39:19<1:33:56,  5.17it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12103/41242 [39:19<1:34:04,  5.16it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12103/41242 [39:19<1:34:04,  5.16it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12104/41242 [39:19<1:34:14,  5.15it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12104/41242 [39:20<1:34:14,  5.15it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12105/41242 [39:20<1:33:18,  5.20it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12105/41242 [39:20<1:33:18,  5.20it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12106/41242 [39:20<1:33:10,  5.21it/s, training_loss=0.013]
Epoch 1:  29%|██▉       | 12106/41242 [39:20<1:33:10,  5.21it/s, training_loss=0.061]
Epoch 1:  29%|██▉       | 12107/41242 [39:20<1:33:07,  5.21it/s, training_loss=0.061]
Epoch 1:  29%|██▉       | 12107/41242 [39:20<1:33:07,  5.21it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12108/41242 [39:20<1:32:35,  5.24it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12108/41242 [39:20<1:32:35,  5.24it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12109/41242 [39:20<1:32:16,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12109/41242 [39:21<1:32:16,  5.26it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12110/41242 [39:21<1:33:04,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12110/41242 [39:21<1:33:04,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12111/41242 [39:21<1:33:04,  5.22it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12111/41242 [39:21<1:33:04,  5.22it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 12112/41242 [39:21<1:34:15,  5.15it/s, training_loss=0.010]
Epoch 1:  29%|██▉       | 12112/41242 [39:21<1:34:15,  5.15it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12113/41242 [39:21<1:34:46,  5.12it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12113/41242 [39:21<1:34:46,  5.12it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 12114/41242 [39:21<1:34:16,  5.15it/s, training_loss=0.020]
Epoch 1:  29%|██▉       | 12114/41242 [39:22<1:34:16,  5.15it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12115/41242 [39:22<1:33:36,  5.19it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12115/41242 [39:22<1:33:36,  5.19it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 12116/41242 [39:22<1:32:35,  5.24it/s, training_loss=0.001]
Epoch 1:  29%|██▉       | 12116/41242 [39:22<1:32:35,  5.24it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12117/41242 [39:22<1:33:06,  5.21it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12117/41242 [39:22<1:33:06,  5.21it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12118/41242 [39:22<1:32:26,  5.25it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12118/41242 [39:22<1:32:26,  5.25it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12119/41242 [39:22<1:31:28,  5.31it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12119/41242 [39:22<1:31:28,  5.31it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12120/41242 [39:22<1:31:02,  5.33it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12120/41242 [39:23<1:31:02,  5.33it/s, training_loss=0.322]
Epoch 1:  29%|██▉       | 12121/41242 [39:23<1:32:38,  5.24it/s, training_loss=0.322]
Epoch 1:  29%|██▉       | 12121/41242 [39:23<1:32:38,  5.24it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12122/41242 [39:23<1:32:08,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12122/41242 [39:23<1:32:08,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12123/41242 [39:23<1:31:47,  5.29it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12123/41242 [39:23<1:31:47,  5.29it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12124/41242 [39:23<1:31:09,  5.32it/s, training_loss=0.008]
Epoch 1:  29%|██▉       | 12124/41242 [39:23<1:31:09,  5.32it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12125/41242 [39:23<1:30:33,  5.36it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12125/41242 [39:24<1:30:33,  5.36it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12126/41242 [39:24<1:32:15,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12126/41242 [39:24<1:32:15,  5.26it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12127/41242 [39:24<1:31:24,  5.31it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12127/41242 [39:24<1:31:24,  5.31it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12128/41242 [39:24<1:32:04,  5.27it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12128/41242 [39:24<1:32:04,  5.27it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 12129/41242 [39:24<1:31:58,  5.28it/s, training_loss=0.022]
Epoch 1:  29%|██▉       | 12129/41242 [39:24<1:31:58,  5.28it/s, training_loss=0.317]
Epoch 1:  29%|██▉       | 12130/41242 [39:24<1:32:34,  5.24it/s, training_loss=0.317]
Epoch 1:  29%|██▉       | 12130/41242 [39:25<1:32:34,  5.24it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12131/41242 [39:25<1:32:58,  5.22it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12131/41242 [39:25<1:32:58,  5.22it/s, training_loss=0.028]
Epoch 1:  29%|██▉       | 12132/41242 [39:25<1:34:16,  5.15it/s, training_loss=0.028]
Epoch 1:  29%|██▉       | 12132/41242 [39:25<1:34:16,  5.15it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12133/41242 [39:25<1:35:15,  5.09it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12133/41242 [39:25<1:35:15,  5.09it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12134/41242 [39:25<1:37:01,  5.00it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12134/41242 [39:25<1:37:01,  5.00it/s, training_loss=0.176]
Epoch 1:  29%|██▉       | 12135/41242 [39:25<1:37:00,  5.00it/s, training_loss=0.176]
Epoch 1:  29%|██▉       | 12135/41242 [39:26<1:37:00,  5.00it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12136/41242 [39:26<1:37:27,  4.98it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12136/41242 [39:26<1:37:27,  4.98it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12137/41242 [39:26<1:36:29,  5.03it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12137/41242 [39:26<1:36:29,  5.03it/s, training_loss=0.673]
Epoch 1:  29%|██▉       | 12138/41242 [39:26<1:37:41,  4.97it/s, training_loss=0.673]
Epoch 1:  29%|██▉       | 12138/41242 [39:26<1:37:41,  4.97it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 12139/41242 [39:26<1:35:36,  5.07it/s, training_loss=0.058]
Epoch 1:  29%|██▉       | 12139/41242 [39:26<1:35:36,  5.07it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12140/41242 [39:26<1:33:54,  5.16it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12140/41242 [39:27<1:33:54,  5.16it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12141/41242 [39:27<1:34:33,  5.13it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12141/41242 [39:27<1:34:33,  5.13it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12142/41242 [39:27<1:33:54,  5.16it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12142/41242 [39:27<1:33:54,  5.16it/s, training_loss=0.349]
Epoch 1:  29%|██▉       | 12143/41242 [39:27<1:33:48,  5.17it/s, training_loss=0.349]
Epoch 1:  29%|██▉       | 12143/41242 [39:27<1:33:48,  5.17it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12144/41242 [39:27<1:33:29,  5.19it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12144/41242 [39:27<1:33:29,  5.19it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12145/41242 [39:27<1:34:00,  5.16it/s, training_loss=0.006]
Epoch 1:  29%|██▉       | 12145/41242 [39:28<1:34:00,  5.16it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12146/41242 [39:28<1:34:26,  5.13it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12146/41242 [39:28<1:34:26,  5.13it/s, training_loss=0.040]
Epoch 1:  29%|██▉       | 12147/41242 [39:28<1:35:58,  5.05it/s, training_loss=0.040]
Epoch 1:  29%|██▉       | 12147/41242 [39:28<1:35:58,  5.05it/s, training_loss=0.301]
Epoch 1:  29%|██▉       | 12148/41242 [39:28<1:35:46,  5.06it/s, training_loss=0.301]
Epoch 1:  29%|██▉       | 12148/41242 [39:28<1:35:46,  5.06it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12149/41242 [39:28<1:34:49,  5.11it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12149/41242 [39:28<1:34:49,  5.11it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12150/41242 [39:28<1:34:17,  5.14it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12150/41242 [39:28<1:34:17,  5.14it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 12151/41242 [39:28<1:33:54,  5.16it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 12151/41242 [39:29<1:33:54,  5.16it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 12152/41242 [39:29<1:33:26,  5.19it/s, training_loss=0.015]
Epoch 1:  29%|██▉       | 12152/41242 [39:29<1:33:26,  5.19it/s, training_loss=0.439]
Epoch 1:  29%|██▉       | 12153/41242 [39:29<1:32:54,  5.22it/s, training_loss=0.439]
Epoch 1:  29%|██▉       | 12153/41242 [39:29<1:32:54,  5.22it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12154/41242 [39:29<1:31:58,  5.27it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12154/41242 [39:29<1:31:58,  5.27it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12155/41242 [39:29<1:31:21,  5.31it/s, training_loss=0.003]
Epoch 1:  29%|██▉       | 12155/41242 [39:29<1:31:21,  5.31it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12156/41242 [39:29<1:30:41,  5.34it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12156/41242 [39:30<1:30:41,  5.34it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 12157/41242 [39:30<1:31:28,  5.30it/s, training_loss=0.011]
Epoch 1:  29%|██▉       | 12157/41242 [39:30<1:31:28,  5.30it/s, training_loss=0.093]
Epoch 1:  29%|██▉       | 12158/41242 [39:30<1:31:44,  5.28it/s, training_loss=0.093]
Epoch 1:  29%|██▉       | 12158/41242 [39:30<1:31:44,  5.28it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12159/41242 [39:30<1:31:21,  5.31it/s, training_loss=0.002]
Epoch 1:  29%|██▉       | 12159/41242 [39:30<1:31:21,  5.31it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12160/41242 [39:30<1:30:56,  5.33it/s, training_loss=0.004]
Epoch 1:  29%|██▉       | 12160/41242 [39:30<1:30:56,  5.33it/s, training_loss=0.198]
Epoch 1:  29%|██▉       | 12161/41242 [39:30<1:32:11,  5.26it/s, training_loss=0.198]
Epoch 1:  29%|██▉       | 12161/41242 [39:31<1:32:11,  5.26it/s, training_loss=0.376]
Epoch 1:  29%|██▉       | 12162/41242 [39:31<1:32:40,  5.23it/s, training_loss=0.376]
Epoch 1:  29%|██▉       | 12162/41242 [39:31<1:32:40,  5.23it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12163/41242 [39:31<1:33:44,  5.17it/s, training_loss=0.009]
Epoch 1:  29%|██▉       | 12163/41242 [39:31<1:33:44,  5.17it/s, training_loss=0.177]
Epoch 1:  29%|██▉       | 12164/41242 [39:31<1:32:45,  5.22it/s, training_loss=0.177]
Epoch 1:  29%|██▉       | 12164/41242 [39:31<1:32:45,  5.22it/s, training_loss=0.054]
Epoch 1:  29%|██▉       | 12165/41242 [39:31<1:32:55,  5.22it/s, training_loss=0.054]
Epoch 1:  29%|██▉       | 12165/41242 [39:31<1:32:55,  5.22it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12166/41242 [39:31<1:32:04,  5.26it/s, training_loss=0.005]
Epoch 1:  29%|██▉       | 12166/41242 [39:32<1:32:04,  5.26it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12167/41242 [39:32<1:31:42,  5.28it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12167/41242 [39:32<1:31:42,  5.28it/s, training_loss=0.407]
Epoch 1:  30%|██▉       | 12168/41242 [39:32<1:34:36,  5.12it/s, training_loss=0.407]
Epoch 1:  30%|██▉       | 12168/41242 [39:32<1:34:36,  5.12it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12169/41242 [39:32<1:33:50,  5.16it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12169/41242 [39:32<1:33:50,  5.16it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12170/41242 [39:32<1:33:45,  5.17it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12170/41242 [39:32<1:33:45,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12171/41242 [39:32<1:33:36,  5.18it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12171/41242 [39:32<1:33:36,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12172/41242 [39:33<1:33:13,  5.20it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12172/41242 [39:33<1:33:13,  5.20it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12173/41242 [39:33<1:33:42,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12173/41242 [39:33<1:33:42,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12174/41242 [39:33<1:32:09,  5.26it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12174/41242 [39:33<1:32:09,  5.26it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12175/41242 [39:33<1:31:37,  5.29it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12175/41242 [39:33<1:31:37,  5.29it/s, training_loss=0.306]
Epoch 1:  30%|██▉       | 12176/41242 [39:33<1:31:34,  5.29it/s, training_loss=0.306]
Epoch 1:  30%|██▉       | 12176/41242 [39:33<1:31:34,  5.29it/s, training_loss=0.032]
Epoch 1:  30%|██▉       | 12177/41242 [39:33<1:33:08,  5.20it/s, training_loss=0.032]
Epoch 1:  30%|██▉       | 12177/41242 [39:34<1:33:08,  5.20it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12178/41242 [39:34<1:32:46,  5.22it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12178/41242 [39:34<1:32:46,  5.22it/s, training_loss=0.206]
Epoch 1:  30%|██▉       | 12179/41242 [39:34<1:33:39,  5.17it/s, training_loss=0.206]
Epoch 1:  30%|██▉       | 12179/41242 [39:34<1:33:39,  5.17it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12180/41242 [39:34<1:33:41,  5.17it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12180/41242 [39:34<1:33:41,  5.17it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12181/41242 [39:34<1:33:05,  5.20it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12181/41242 [39:34<1:33:05,  5.20it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12182/41242 [39:34<1:32:47,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12182/41242 [39:35<1:32:47,  5.22it/s, training_loss=0.353]
Epoch 1:  30%|██▉       | 12183/41242 [39:35<1:34:00,  5.15it/s, training_loss=0.353]
Epoch 1:  30%|██▉       | 12183/41242 [39:35<1:34:00,  5.15it/s, training_loss=0.011]
Epoch 1:  30%|██▉       | 12184/41242 [39:35<1:33:35,  5.17it/s, training_loss=0.011]
Epoch 1:  30%|██▉       | 12184/41242 [39:35<1:33:35,  5.17it/s, training_loss=0.041]
Epoch 1:  30%|██▉       | 12185/41242 [39:35<1:32:45,  5.22it/s, training_loss=0.041]
Epoch 1:  30%|██▉       | 12185/41242 [39:35<1:32:45,  5.22it/s, training_loss=1.081]
Epoch 1:  30%|██▉       | 12186/41242 [39:35<1:32:12,  5.25it/s, training_loss=1.081]
Epoch 1:  30%|██▉       | 12186/41242 [39:35<1:32:12,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12187/41242 [39:35<1:31:23,  5.30it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12187/41242 [39:36<1:31:23,  5.30it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12188/41242 [39:36<1:31:31,  5.29it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12188/41242 [39:36<1:31:31,  5.29it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12189/41242 [39:36<1:33:18,  5.19it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12189/41242 [39:36<1:33:18,  5.19it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12190/41242 [39:36<1:34:05,  5.15it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12190/41242 [39:36<1:34:05,  5.15it/s, training_loss=0.043]
Epoch 1:  30%|██▉       | 12191/41242 [39:36<1:34:26,  5.13it/s, training_loss=0.043]
Epoch 1:  30%|██▉       | 12191/41242 [39:36<1:34:26,  5.13it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12192/41242 [39:36<1:33:57,  5.15it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12192/41242 [39:37<1:33:57,  5.15it/s, training_loss=0.152]
Epoch 1:  30%|██▉       | 12193/41242 [39:37<1:33:53,  5.16it/s, training_loss=0.152]
Epoch 1:  30%|██▉       | 12193/41242 [39:37<1:33:53,  5.16it/s, training_loss=0.334]
Epoch 1:  30%|██▉       | 12194/41242 [39:37<1:34:52,  5.10it/s, training_loss=0.334]
Epoch 1:  30%|██▉       | 12194/41242 [39:37<1:34:52,  5.10it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12195/41242 [39:37<1:32:55,  5.21it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12195/41242 [39:37<1:32:55,  5.21it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12196/41242 [39:37<1:32:13,  5.25it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12196/41242 [39:37<1:32:13,  5.25it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12197/41242 [39:37<1:32:00,  5.26it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12197/41242 [39:37<1:32:00,  5.26it/s, training_loss=0.023]
Epoch 1:  30%|██▉       | 12198/41242 [39:38<1:34:19,  5.13it/s, training_loss=0.023]
Epoch 1:  30%|██▉       | 12198/41242 [39:38<1:34:19,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12199/41242 [39:38<1:35:36,  5.06it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12199/41242 [39:38<1:35:36,  5.06it/s, training_loss=0.177]
Epoch 1:  30%|██▉       | 12200/41242 [39:38<1:35:30,  5.07it/s, training_loss=0.177]
Epoch 1:  30%|██▉       | 12200/41242 [39:38<1:35:30,  5.07it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12201/41242 [39:38<1:35:36,  5.06it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12201/41242 [39:38<1:35:36,  5.06it/s, training_loss=0.032]
Epoch 1:  30%|██▉       | 12202/41242 [39:38<1:36:07,  5.04it/s, training_loss=0.032]
Epoch 1:  30%|██▉       | 12202/41242 [39:38<1:36:07,  5.04it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12203/41242 [39:39<1:36:19,  5.02it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12203/41242 [39:39<1:36:19,  5.02it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12204/41242 [39:39<1:34:34,  5.12it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12204/41242 [39:39<1:34:34,  5.12it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12205/41242 [39:39<1:33:31,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12205/41242 [39:39<1:33:31,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12206/41242 [39:39<1:32:30,  5.23it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12206/41242 [39:39<1:32:30,  5.23it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12207/41242 [39:39<1:32:14,  5.25it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12207/41242 [39:39<1:32:14,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12208/41242 [39:39<1:31:10,  5.31it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12208/41242 [39:40<1:31:10,  5.31it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12209/41242 [39:40<1:30:31,  5.35it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12209/41242 [39:40<1:30:31,  5.35it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12210/41242 [39:40<1:29:44,  5.39it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12210/41242 [39:40<1:29:44,  5.39it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12211/41242 [39:40<1:30:07,  5.37it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12211/41242 [39:40<1:30:07,  5.37it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12212/41242 [39:40<1:30:17,  5.36it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12212/41242 [39:40<1:30:17,  5.36it/s, training_loss=0.941]
Epoch 1:  30%|██▉       | 12213/41242 [39:40<1:31:57,  5.26it/s, training_loss=0.941]
Epoch 1:  30%|██▉       | 12213/41242 [39:41<1:31:57,  5.26it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12214/41242 [39:41<1:32:27,  5.23it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12214/41242 [39:41<1:32:27,  5.23it/s, training_loss=0.135]
Epoch 1:  30%|██▉       | 12215/41242 [39:41<1:31:34,  5.28it/s, training_loss=0.135]
Epoch 1:  30%|██▉       | 12215/41242 [39:41<1:31:34,  5.28it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12216/41242 [39:41<1:31:43,  5.27it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12216/41242 [39:41<1:31:43,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12217/41242 [39:41<1:31:52,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12217/41242 [39:41<1:31:52,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12218/41242 [39:41<1:32:06,  5.25it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12218/41242 [39:42<1:32:06,  5.25it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12219/41242 [39:42<1:33:22,  5.18it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12219/41242 [39:42<1:33:22,  5.18it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12220/41242 [39:42<1:34:07,  5.14it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12220/41242 [39:42<1:34:07,  5.14it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12221/41242 [39:42<1:32:49,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12221/41242 [39:42<1:32:49,  5.21it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12222/41242 [39:42<1:34:20,  5.13it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12222/41242 [39:42<1:34:20,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12223/41242 [39:42<1:33:59,  5.15it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12223/41242 [39:42<1:33:59,  5.15it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12224/41242 [39:43<1:34:08,  5.14it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12224/41242 [39:43<1:34:08,  5.14it/s, training_loss=1.061]
Epoch 1:  30%|██▉       | 12225/41242 [39:43<1:35:06,  5.08it/s, training_loss=1.061]
Epoch 1:  30%|██▉       | 12225/41242 [39:43<1:35:06,  5.08it/s, training_loss=0.012]
Epoch 1:  30%|██▉       | 12226/41242 [39:43<1:35:42,  5.05it/s, training_loss=0.012]
Epoch 1:  30%|██▉       | 12226/41242 [39:43<1:35:42,  5.05it/s, training_loss=0.415]
Epoch 1:  30%|██▉       | 12227/41242 [39:43<1:34:50,  5.10it/s, training_loss=0.415]
Epoch 1:  30%|██▉       | 12227/41242 [39:43<1:34:50,  5.10it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12228/41242 [39:43<1:33:26,  5.18it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12228/41242 [39:43<1:33:26,  5.18it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12229/41242 [39:43<1:33:02,  5.20it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12229/41242 [39:44<1:33:02,  5.20it/s, training_loss=0.160]
Epoch 1:  30%|██▉       | 12230/41242 [39:44<1:33:40,  5.16it/s, training_loss=0.160]
Epoch 1:  30%|██▉       | 12230/41242 [39:44<1:33:40,  5.16it/s, training_loss=0.024]
Epoch 1:  30%|██▉       | 12231/41242 [39:44<1:33:38,  5.16it/s, training_loss=0.024]
Epoch 1:  30%|██▉       | 12231/41242 [39:44<1:33:38,  5.16it/s, training_loss=0.095]
Epoch 1:  30%|██▉       | 12232/41242 [39:44<1:32:50,  5.21it/s, training_loss=0.095]
Epoch 1:  30%|██▉       | 12232/41242 [39:44<1:32:50,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12233/41242 [39:44<1:32:04,  5.25it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12233/41242 [39:44<1:32:04,  5.25it/s, training_loss=0.010]
Epoch 1:  30%|██▉       | 12234/41242 [39:44<1:32:11,  5.24it/s, training_loss=0.010]
Epoch 1:  30%|██▉       | 12234/41242 [39:45<1:32:11,  5.24it/s, training_loss=0.201]
Epoch 1:  30%|██▉       | 12235/41242 [39:45<1:31:13,  5.30it/s, training_loss=0.201]
Epoch 1:  30%|██▉       | 12235/41242 [39:45<1:31:13,  5.30it/s, training_loss=0.299]
Epoch 1:  30%|██▉       | 12236/41242 [39:45<1:31:24,  5.29it/s, training_loss=0.299]
Epoch 1:  30%|██▉       | 12236/41242 [39:45<1:31:24,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12237/41242 [39:45<1:32:05,  5.25it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12237/41242 [39:45<1:32:05,  5.25it/s, training_loss=0.120]
Epoch 1:  30%|██▉       | 12238/41242 [39:45<1:33:23,  5.18it/s, training_loss=0.120]
Epoch 1:  30%|██▉       | 12238/41242 [39:45<1:33:23,  5.18it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12239/41242 [39:45<1:32:24,  5.23it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12239/41242 [39:46<1:32:24,  5.23it/s, training_loss=0.028]
Epoch 1:  30%|██▉       | 12240/41242 [39:46<1:32:15,  5.24it/s, training_loss=0.028]
Epoch 1:  30%|██▉       | 12240/41242 [39:46<1:32:15,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12241/41242 [39:46<1:33:03,  5.19it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12241/41242 [39:46<1:33:03,  5.19it/s, training_loss=0.044]
Epoch 1:  30%|██▉       | 12242/41242 [39:46<1:32:22,  5.23it/s, training_loss=0.044]
Epoch 1:  30%|██▉       | 12242/41242 [39:46<1:32:22,  5.23it/s, training_loss=0.252]
Epoch 1:  30%|██▉       | 12243/41242 [39:46<1:32:00,  5.25it/s, training_loss=0.252]
Epoch 1:  30%|██▉       | 12243/41242 [39:46<1:32:00,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12244/41242 [39:46<1:32:11,  5.24it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12244/41242 [39:47<1:32:11,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12245/41242 [39:47<1:32:55,  5.20it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12245/41242 [39:47<1:32:55,  5.20it/s, training_loss=0.259]
Epoch 1:  30%|██▉       | 12246/41242 [39:47<1:33:48,  5.15it/s, training_loss=0.259]
Epoch 1:  30%|██▉       | 12246/41242 [39:47<1:33:48,  5.15it/s, training_loss=0.225]
Epoch 1:  30%|██▉       | 12247/41242 [39:47<1:33:39,  5.16it/s, training_loss=0.225]
Epoch 1:  30%|██▉       | 12247/41242 [39:47<1:33:39,  5.16it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12248/41242 [39:47<1:32:10,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12248/41242 [39:47<1:32:10,  5.24it/s, training_loss=0.063]
Epoch 1:  30%|██▉       | 12249/41242 [39:47<1:31:59,  5.25it/s, training_loss=0.063]
Epoch 1:  30%|██▉       | 12249/41242 [39:47<1:31:59,  5.25it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12250/41242 [39:47<1:31:33,  5.28it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12250/41242 [39:48<1:31:33,  5.28it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12251/41242 [39:48<1:33:12,  5.18it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12251/41242 [39:48<1:33:12,  5.18it/s, training_loss=0.503]
Epoch 1:  30%|██▉       | 12252/41242 [39:48<1:33:36,  5.16it/s, training_loss=0.503]
Epoch 1:  30%|██▉       | 12252/41242 [39:48<1:33:36,  5.16it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12253/41242 [39:48<1:33:22,  5.17it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12253/41242 [39:48<1:33:22,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12254/41242 [39:48<1:32:31,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12254/41242 [39:48<1:32:31,  5.22it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12255/41242 [39:48<1:31:47,  5.26it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12255/41242 [39:49<1:31:47,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12256/41242 [39:49<1:30:53,  5.32it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12256/41242 [39:49<1:30:53,  5.32it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12257/41242 [39:49<1:31:11,  5.30it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12257/41242 [39:49<1:31:11,  5.30it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12258/41242 [39:49<1:32:14,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12258/41242 [39:49<1:32:14,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12259/41242 [39:49<1:33:22,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12259/41242 [39:49<1:33:22,  5.17it/s, training_loss=0.012]
Epoch 1:  30%|██▉       | 12260/41242 [39:49<1:33:14,  5.18it/s, training_loss=0.012]
Epoch 1:  30%|██▉       | 12260/41242 [39:50<1:33:14,  5.18it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12261/41242 [39:50<1:33:58,  5.14it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12261/41242 [39:50<1:33:58,  5.14it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12262/41242 [39:50<1:33:29,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12262/41242 [39:50<1:33:29,  5.17it/s, training_loss=0.183]
Epoch 1:  30%|██▉       | 12263/41242 [39:50<1:32:57,  5.20it/s, training_loss=0.183]
Epoch 1:  30%|██▉       | 12263/41242 [39:50<1:32:57,  5.20it/s, training_loss=0.059]
Epoch 1:  30%|██▉       | 12264/41242 [39:50<1:32:44,  5.21it/s, training_loss=0.059]
Epoch 1:  30%|██▉       | 12264/41242 [39:50<1:32:44,  5.21it/s, training_loss=0.810]
Epoch 1:  30%|██▉       | 12265/41242 [39:50<1:32:28,  5.22it/s, training_loss=0.810]
Epoch 1:  30%|██▉       | 12265/41242 [39:51<1:32:28,  5.22it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12266/41242 [39:51<1:31:24,  5.28it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12266/41242 [39:51<1:31:24,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12267/41242 [39:51<1:32:48,  5.20it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12267/41242 [39:51<1:32:48,  5.20it/s, training_loss=0.424]
Epoch 1:  30%|██▉       | 12268/41242 [39:51<1:34:48,  5.09it/s, training_loss=0.424]
Epoch 1:  30%|██▉       | 12268/41242 [39:51<1:34:48,  5.09it/s, training_loss=0.139]
Epoch 1:  30%|██▉       | 12269/41242 [39:51<1:35:22,  5.06it/s, training_loss=0.139]
Epoch 1:  30%|██▉       | 12269/41242 [39:51<1:35:22,  5.06it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12270/41242 [39:51<1:33:43,  5.15it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12270/41242 [39:52<1:33:43,  5.15it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12271/41242 [39:52<1:35:12,  5.07it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12271/41242 [39:52<1:35:12,  5.07it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12272/41242 [39:52<1:34:55,  5.09it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12272/41242 [39:52<1:34:55,  5.09it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12273/41242 [39:52<1:35:15,  5.07it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12273/41242 [39:52<1:35:15,  5.07it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12274/41242 [39:52<1:36:11,  5.02it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12274/41242 [39:52<1:36:11,  5.02it/s, training_loss=0.342]
Epoch 1:  30%|██▉       | 12275/41242 [39:52<1:35:09,  5.07it/s, training_loss=0.342]
Epoch 1:  30%|██▉       | 12275/41242 [39:53<1:35:09,  5.07it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12276/41242 [39:53<1:35:36,  5.05it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12276/41242 [39:53<1:35:36,  5.05it/s, training_loss=0.578]
Epoch 1:  30%|██▉       | 12277/41242 [39:53<1:36:15,  5.02it/s, training_loss=0.578]
Epoch 1:  30%|██▉       | 12277/41242 [39:53<1:36:15,  5.02it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12278/41242 [39:53<1:34:43,  5.10it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12278/41242 [39:53<1:34:43,  5.10it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12279/41242 [39:53<1:33:52,  5.14it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12279/41242 [39:53<1:33:52,  5.14it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12280/41242 [39:53<1:32:26,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12280/41242 [39:53<1:32:26,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12281/41242 [39:53<1:31:10,  5.29it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12281/41242 [39:54<1:31:10,  5.29it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12282/41242 [39:54<1:30:28,  5.33it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12282/41242 [39:54<1:30:28,  5.33it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12283/41242 [39:54<1:29:27,  5.40it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12283/41242 [39:54<1:29:27,  5.40it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12284/41242 [39:54<1:29:07,  5.42it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12284/41242 [39:54<1:29:07,  5.42it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12285/41242 [39:54<1:28:54,  5.43it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12285/41242 [39:54<1:28:54,  5.43it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12286/41242 [39:54<1:29:17,  5.40it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12286/41242 [39:55<1:29:17,  5.40it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12287/41242 [39:55<1:29:00,  5.42it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12287/41242 [39:55<1:29:00,  5.42it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12288/41242 [39:55<1:29:37,  5.38it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12288/41242 [39:55<1:29:37,  5.38it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12289/41242 [39:55<1:29:57,  5.36it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12289/41242 [39:55<1:29:57,  5.36it/s, training_loss=0.014]
Epoch 1:  30%|██▉       | 12290/41242 [39:55<1:31:40,  5.26it/s, training_loss=0.014]
Epoch 1:  30%|██▉       | 12290/41242 [39:55<1:31:40,  5.26it/s, training_loss=0.027]
Epoch 1:  30%|██▉       | 12291/41242 [39:55<1:32:18,  5.23it/s, training_loss=0.027]
Epoch 1:  30%|██▉       | 12291/41242 [39:56<1:32:18,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12292/41242 [39:56<1:31:34,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12292/41242 [39:56<1:31:34,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12293/41242 [39:56<1:31:07,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12293/41242 [39:56<1:31:07,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12294/41242 [39:56<1:30:40,  5.32it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12294/41242 [39:56<1:30:40,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12295/41242 [39:56<1:30:24,  5.34it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12295/41242 [39:56<1:30:24,  5.34it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12296/41242 [39:56<1:30:44,  5.32it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12296/41242 [39:56<1:30:44,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12297/41242 [39:56<1:30:58,  5.30it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12297/41242 [39:57<1:30:58,  5.30it/s, training_loss=0.213]
Epoch 1:  30%|██▉       | 12298/41242 [39:57<1:31:08,  5.29it/s, training_loss=0.213]
Epoch 1:  30%|██▉       | 12298/41242 [39:57<1:31:08,  5.29it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12299/41242 [39:57<1:31:09,  5.29it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12299/41242 [39:57<1:31:09,  5.29it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12300/41242 [39:57<1:30:38,  5.32it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12300/41242 [39:57<1:30:38,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12301/41242 [39:57<1:30:19,  5.34it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12301/41242 [39:57<1:30:19,  5.34it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12302/41242 [39:57<1:30:51,  5.31it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12302/41242 [39:58<1:30:51,  5.31it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12303/41242 [39:58<1:33:10,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12303/41242 [39:58<1:33:10,  5.18it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12304/41242 [39:58<1:32:59,  5.19it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12304/41242 [39:58<1:32:59,  5.19it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12305/41242 [39:58<1:32:33,  5.21it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12305/41242 [39:58<1:32:33,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12306/41242 [39:58<1:32:04,  5.24it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12306/41242 [39:58<1:32:04,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12307/41242 [39:58<1:31:41,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12307/41242 [39:59<1:31:41,  5.26it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12308/41242 [39:59<1:31:01,  5.30it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12308/41242 [39:59<1:31:01,  5.30it/s, training_loss=0.118]
Epoch 1:  30%|██▉       | 12309/41242 [39:59<1:31:22,  5.28it/s, training_loss=0.118]
Epoch 1:  30%|██▉       | 12309/41242 [39:59<1:31:22,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12310/41242 [39:59<1:31:49,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12310/41242 [39:59<1:31:49,  5.25it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12311/41242 [39:59<1:33:20,  5.17it/s, training_loss=0.025]
Epoch 1:  30%|██▉       | 12311/41242 [39:59<1:33:20,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12312/41242 [39:59<1:32:33,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12312/41242 [40:00<1:32:33,  5.21it/s, training_loss=0.011]
Epoch 1:  30%|██▉       | 12313/41242 [40:00<1:31:37,  5.26it/s, training_loss=0.011]
Epoch 1:  30%|██▉       | 12313/41242 [40:00<1:31:37,  5.26it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12314/41242 [40:00<1:31:00,  5.30it/s, training_loss=0.015]
Epoch 1:  30%|██▉       | 12314/41242 [40:00<1:31:00,  5.30it/s, training_loss=0.211]
Epoch 1:  30%|██▉       | 12315/41242 [40:00<1:31:14,  5.28it/s, training_loss=0.211]
Epoch 1:  30%|██▉       | 12315/41242 [40:00<1:31:14,  5.28it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12316/41242 [40:00<1:30:59,  5.30it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12316/41242 [40:00<1:30:59,  5.30it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12317/41242 [40:00<1:30:49,  5.31it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12317/41242 [40:00<1:30:49,  5.31it/s, training_loss=0.076]
Epoch 1:  30%|██▉       | 12318/41242 [40:00<1:33:39,  5.15it/s, training_loss=0.076]
Epoch 1:  30%|██▉       | 12318/41242 [40:01<1:33:39,  5.15it/s, training_loss=0.042]
Epoch 1:  30%|██▉       | 12319/41242 [40:01<1:33:42,  5.14it/s, training_loss=0.042]
Epoch 1:  30%|██▉       | 12319/41242 [40:01<1:33:42,  5.14it/s, training_loss=0.140]
Epoch 1:  30%|██▉       | 12320/41242 [40:01<1:33:51,  5.14it/s, training_loss=0.140]
Epoch 1:  30%|██▉       | 12320/41242 [40:01<1:33:51,  5.14it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12321/41242 [40:01<1:33:31,  5.15it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12321/41242 [40:01<1:33:31,  5.15it/s, training_loss=0.045]
Epoch 1:  30%|██▉       | 12322/41242 [40:01<1:36:16,  5.01it/s, training_loss=0.045]
Epoch 1:  30%|██▉       | 12322/41242 [40:01<1:36:16,  5.01it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12323/41242 [40:01<1:35:07,  5.07it/s, training_loss=0.013]
Epoch 1:  30%|██▉       | 12323/41242 [40:02<1:35:07,  5.07it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12324/41242 [40:02<1:34:55,  5.08it/s, training_loss=0.005]
Epoch 1:  30%|██▉       | 12324/41242 [40:02<1:34:55,  5.08it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12325/41242 [40:02<1:33:17,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12325/41242 [40:02<1:33:17,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12326/41242 [40:02<1:32:10,  5.23it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12326/41242 [40:02<1:32:10,  5.23it/s, training_loss=0.107]
Epoch 1:  30%|██▉       | 12327/41242 [40:02<1:32:15,  5.22it/s, training_loss=0.107]
Epoch 1:  30%|██▉       | 12327/41242 [40:02<1:32:15,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12328/41242 [40:02<1:31:25,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12328/41242 [40:03<1:31:25,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12329/41242 [40:03<1:31:52,  5.25it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12329/41242 [40:03<1:31:52,  5.25it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12330/41242 [40:03<1:32:57,  5.18it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12330/41242 [40:03<1:32:57,  5.18it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12331/41242 [40:03<1:31:45,  5.25it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12331/41242 [40:03<1:31:45,  5.25it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12332/41242 [40:03<1:30:46,  5.31it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12332/41242 [40:03<1:30:46,  5.31it/s, training_loss=0.090]
Epoch 1:  30%|██▉       | 12333/41242 [40:03<1:32:52,  5.19it/s, training_loss=0.090]
Epoch 1:  30%|██▉       | 12333/41242 [40:04<1:32:52,  5.19it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12334/41242 [40:04<1:32:21,  5.22it/s, training_loss=0.007]
Epoch 1:  30%|██▉       | 12334/41242 [40:04<1:32:21,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12335/41242 [40:04<1:31:54,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12335/41242 [40:04<1:31:54,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12336/41242 [40:04<1:31:06,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12336/41242 [40:04<1:31:06,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12337/41242 [40:04<1:33:16,  5.16it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12337/41242 [40:04<1:33:16,  5.16it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12338/41242 [40:04<1:32:59,  5.18it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12338/41242 [40:05<1:32:59,  5.18it/s, training_loss=0.049]
Epoch 1:  30%|██▉       | 12339/41242 [40:05<1:32:29,  5.21it/s, training_loss=0.049]
Epoch 1:  30%|██▉       | 12339/41242 [40:05<1:32:29,  5.21it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12340/41242 [40:05<1:31:37,  5.26it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12340/41242 [40:05<1:31:37,  5.26it/s, training_loss=0.420]
Epoch 1:  30%|██▉       | 12341/41242 [40:05<1:31:21,  5.27it/s, training_loss=0.420]
Epoch 1:  30%|██▉       | 12341/41242 [40:05<1:31:21,  5.27it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12342/41242 [40:05<1:32:24,  5.21it/s, training_loss=0.019]
Epoch 1:  30%|██▉       | 12342/41242 [40:05<1:32:24,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12343/41242 [40:05<1:31:57,  5.24it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12343/41242 [40:05<1:31:57,  5.24it/s, training_loss=0.383]
Epoch 1:  30%|██▉       | 12344/41242 [40:05<1:33:00,  5.18it/s, training_loss=0.383]
Epoch 1:  30%|██▉       | 12344/41242 [40:06<1:33:00,  5.18it/s, training_loss=0.205]
Epoch 1:  30%|██▉       | 12345/41242 [40:06<1:32:25,  5.21it/s, training_loss=0.205]
Epoch 1:  30%|██▉       | 12345/41242 [40:06<1:32:25,  5.21it/s, training_loss=0.010]
Epoch 1:  30%|██▉       | 12346/41242 [40:06<1:32:07,  5.23it/s, training_loss=0.010]
Epoch 1:  30%|██▉       | 12346/41242 [40:06<1:32:07,  5.23it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12347/41242 [40:06<1:32:18,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12347/41242 [40:06<1:32:18,  5.22it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12348/41242 [40:06<1:33:44,  5.14it/s, training_loss=0.009]
Epoch 1:  30%|██▉       | 12348/41242 [40:06<1:33:44,  5.14it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12349/41242 [40:06<1:33:20,  5.16it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12349/41242 [40:07<1:33:20,  5.16it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12350/41242 [40:07<1:32:49,  5.19it/s, training_loss=0.008]
Epoch 1:  30%|██▉       | 12350/41242 [40:07<1:32:49,  5.19it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12351/41242 [40:07<1:31:20,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12351/41242 [40:07<1:31:20,  5.27it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12352/41242 [40:07<1:31:09,  5.28it/s, training_loss=0.006]
Epoch 1:  30%|██▉       | 12352/41242 [40:07<1:31:09,  5.28it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12353/41242 [40:07<1:31:11,  5.28it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12353/41242 [40:07<1:31:11,  5.28it/s, training_loss=0.042]
Epoch 1:  30%|██▉       | 12354/41242 [40:07<1:34:21,  5.10it/s, training_loss=0.042]
Epoch 1:  30%|██▉       | 12354/41242 [40:08<1:34:21,  5.10it/s, training_loss=0.014]
Epoch 1:  30%|██▉       | 12355/41242 [40:08<1:35:13,  5.06it/s, training_loss=0.014]
Epoch 1:  30%|██▉       | 12355/41242 [40:08<1:35:13,  5.06it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12356/41242 [40:08<1:34:04,  5.12it/s, training_loss=0.001]
Epoch 1:  30%|██▉       | 12356/41242 [40:08<1:34:04,  5.12it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12357/41242 [40:08<1:33:39,  5.14it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12357/41242 [40:08<1:33:39,  5.14it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12358/41242 [40:08<1:32:15,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12358/41242 [40:08<1:32:15,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12359/41242 [40:08<1:31:22,  5.27it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12359/41242 [40:09<1:31:22,  5.27it/s, training_loss=0.463]
Epoch 1:  30%|██▉       | 12360/41242 [40:09<1:31:36,  5.25it/s, training_loss=0.463]
Epoch 1:  30%|██▉       | 12360/41242 [40:09<1:31:36,  5.25it/s, training_loss=0.569]
Epoch 1:  30%|██▉       | 12361/41242 [40:09<1:31:37,  5.25it/s, training_loss=0.569]
Epoch 1:  30%|██▉       | 12361/41242 [40:09<1:31:37,  5.25it/s, training_loss=0.168]
Epoch 1:  30%|██▉       | 12362/41242 [40:09<1:31:58,  5.23it/s, training_loss=0.168]
Epoch 1:  30%|██▉       | 12362/41242 [40:09<1:31:58,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12363/41242 [40:09<1:32:08,  5.22it/s, training_loss=0.003]
Epoch 1:  30%|██▉       | 12363/41242 [40:09<1:32:08,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12364/41242 [40:09<1:31:49,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12364/41242 [40:10<1:31:49,  5.24it/s, training_loss=0.486]
Epoch 1:  30%|██▉       | 12365/41242 [40:10<1:32:29,  5.20it/s, training_loss=0.486]
Epoch 1:  30%|██▉       | 12365/41242 [40:10<1:32:29,  5.20it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12366/41242 [40:10<1:31:01,  5.29it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12366/41242 [40:10<1:31:01,  5.29it/s, training_loss=0.035]
Epoch 1:  30%|██▉       | 12367/41242 [40:10<1:30:47,  5.30it/s, training_loss=0.035]
Epoch 1:  30%|██▉       | 12367/41242 [40:10<1:30:47,  5.30it/s, training_loss=0.024]
Epoch 1:  30%|██▉       | 12368/41242 [40:10<1:32:01,  5.23it/s, training_loss=0.024]
Epoch 1:  30%|██▉       | 12368/41242 [40:10<1:32:01,  5.23it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12369/41242 [40:10<1:33:44,  5.13it/s, training_loss=0.021]
Epoch 1:  30%|██▉       | 12369/41242 [40:10<1:33:44,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12370/41242 [40:10<1:33:08,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|██▉       | 12370/41242 [40:11<1:33:08,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12371/41242 [40:11<1:32:40,  5.19it/s, training_loss=0.004]
Epoch 1:  30%|██▉       | 12371/41242 [40:11<1:32:40,  5.19it/s, training_loss=0.902]
Epoch 1:  30%|██▉       | 12372/41242 [40:11<1:33:54,  5.12it/s, training_loss=0.902]
Epoch 1:  30%|██▉       | 12372/41242 [40:11<1:33:54,  5.12it/s, training_loss=0.711]
Epoch 1:  30%|███       | 12373/41242 [40:11<1:35:21,  5.05it/s, training_loss=0.711]
Epoch 1:  30%|███       | 12373/41242 [40:11<1:35:21,  5.05it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12374/41242 [40:11<1:33:58,  5.12it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12374/41242 [40:11<1:33:58,  5.12it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12375/41242 [40:11<1:32:49,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12375/41242 [40:12<1:32:49,  5.18it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12376/41242 [40:12<1:31:31,  5.26it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12376/41242 [40:12<1:31:31,  5.26it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12377/41242 [40:12<1:32:59,  5.17it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12377/41242 [40:12<1:32:59,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12378/41242 [40:12<1:31:52,  5.24it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12378/41242 [40:12<1:31:52,  5.24it/s, training_loss=0.311]
Epoch 1:  30%|███       | 12379/41242 [40:12<1:31:49,  5.24it/s, training_loss=0.311]
Epoch 1:  30%|███       | 12379/41242 [40:12<1:31:49,  5.24it/s, training_loss=0.174]
Epoch 1:  30%|███       | 12380/41242 [40:12<1:33:21,  5.15it/s, training_loss=0.174]
Epoch 1:  30%|███       | 12380/41242 [40:13<1:33:21,  5.15it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12381/41242 [40:13<1:33:47,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12381/41242 [40:13<1:33:47,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12382/41242 [40:13<1:33:49,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12382/41242 [40:13<1:33:49,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12383/41242 [40:13<1:32:55,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12383/41242 [40:13<1:32:55,  5.18it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12384/41242 [40:13<1:32:15,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12384/41242 [40:13<1:32:15,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12385/41242 [40:13<1:31:19,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12385/41242 [40:14<1:31:19,  5.27it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12386/41242 [40:14<1:30:06,  5.34it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12386/41242 [40:14<1:30:06,  5.34it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12387/41242 [40:14<1:29:55,  5.35it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12387/41242 [40:14<1:29:55,  5.35it/s, training_loss=0.059]
Epoch 1:  30%|███       | 12388/41242 [40:14<1:29:41,  5.36it/s, training_loss=0.059]
Epoch 1:  30%|███       | 12388/41242 [40:14<1:29:41,  5.36it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12389/41242 [40:14<1:29:10,  5.39it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12389/41242 [40:14<1:29:10,  5.39it/s, training_loss=0.080]
Epoch 1:  30%|███       | 12390/41242 [40:14<1:30:10,  5.33it/s, training_loss=0.080]
Epoch 1:  30%|███       | 12390/41242 [40:14<1:30:10,  5.33it/s, training_loss=0.068]
Epoch 1:  30%|███       | 12391/41242 [40:14<1:30:43,  5.30it/s, training_loss=0.068]
Epoch 1:  30%|███       | 12391/41242 [40:15<1:30:43,  5.30it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12392/41242 [40:15<1:29:44,  5.36it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12392/41242 [40:15<1:29:44,  5.36it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12393/41242 [40:15<1:30:17,  5.33it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12393/41242 [40:15<1:30:17,  5.33it/s, training_loss=0.069]
Epoch 1:  30%|███       | 12394/41242 [40:15<1:32:07,  5.22it/s, training_loss=0.069]
Epoch 1:  30%|███       | 12394/41242 [40:15<1:32:07,  5.22it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12395/41242 [40:15<1:31:41,  5.24it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12395/41242 [40:15<1:31:41,  5.24it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12396/41242 [40:15<1:31:14,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12396/41242 [40:16<1:31:14,  5.27it/s, training_loss=0.007]
Epoch 1:  30%|███       | 12397/41242 [40:16<1:30:26,  5.32it/s, training_loss=0.007]
Epoch 1:  30%|███       | 12397/41242 [40:16<1:30:26,  5.32it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12398/41242 [40:16<1:29:44,  5.36it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12398/41242 [40:16<1:29:44,  5.36it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12399/41242 [40:16<1:29:49,  5.35it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12399/41242 [40:16<1:29:49,  5.35it/s, training_loss=0.042]
Epoch 1:  30%|███       | 12400/41242 [40:16<1:30:31,  5.31it/s, training_loss=0.042]
Epoch 1:  30%|███       | 12400/41242 [40:16<1:30:31,  5.31it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12401/41242 [40:16<1:31:09,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12401/41242 [40:17<1:31:09,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12402/41242 [40:17<1:31:25,  5.26it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12402/41242 [40:17<1:31:25,  5.26it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12403/41242 [40:17<1:31:04,  5.28it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12403/41242 [40:17<1:31:04,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12404/41242 [40:17<1:30:06,  5.33it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12404/41242 [40:17<1:30:06,  5.33it/s, training_loss=0.026]
Epoch 1:  30%|███       | 12405/41242 [40:17<1:30:02,  5.34it/s, training_loss=0.026]
Epoch 1:  30%|███       | 12405/41242 [40:17<1:30:02,  5.34it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12406/41242 [40:17<1:29:22,  5.38it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12406/41242 [40:18<1:29:22,  5.38it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12407/41242 [40:18<1:29:37,  5.36it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12407/41242 [40:18<1:29:37,  5.36it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12408/41242 [40:18<1:31:49,  5.23it/s, training_loss=0.009]
Epoch 1:  30%|███       | 12408/41242 [40:18<1:31:49,  5.23it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12409/41242 [40:18<1:31:40,  5.24it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12409/41242 [40:18<1:31:40,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12410/41242 [40:18<1:31:24,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12410/41242 [40:18<1:31:24,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12411/41242 [40:18<1:30:35,  5.30it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12411/41242 [40:18<1:30:35,  5.30it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12412/41242 [40:18<1:30:17,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12412/41242 [40:19<1:30:17,  5.32it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12413/41242 [40:19<1:31:13,  5.27it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12413/41242 [40:19<1:31:13,  5.27it/s, training_loss=0.148]
Epoch 1:  30%|███       | 12414/41242 [40:19<1:30:42,  5.30it/s, training_loss=0.148]
Epoch 1:  30%|███       | 12414/41242 [40:19<1:30:42,  5.30it/s, training_loss=0.077]
Epoch 1:  30%|███       | 12415/41242 [40:19<1:30:32,  5.31it/s, training_loss=0.077]
Epoch 1:  30%|███       | 12415/41242 [40:19<1:30:32,  5.31it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12416/41242 [40:19<1:30:36,  5.30it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12416/41242 [40:19<1:30:36,  5.30it/s, training_loss=0.241]
Epoch 1:  30%|███       | 12417/41242 [40:19<1:31:12,  5.27it/s, training_loss=0.241]
Epoch 1:  30%|███       | 12417/41242 [40:20<1:31:12,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12418/41242 [40:20<1:31:08,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12418/41242 [40:20<1:31:08,  5.27it/s, training_loss=0.254]
Epoch 1:  30%|███       | 12419/41242 [40:20<1:32:37,  5.19it/s, training_loss=0.254]
Epoch 1:  30%|███       | 12419/41242 [40:20<1:32:37,  5.19it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12420/41242 [40:20<1:34:52,  5.06it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12420/41242 [40:20<1:34:52,  5.06it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12421/41242 [40:20<1:33:12,  5.15it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12421/41242 [40:20<1:33:12,  5.15it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12422/41242 [40:20<1:31:45,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12422/41242 [40:21<1:31:45,  5.23it/s, training_loss=0.017]
Epoch 1:  30%|███       | 12423/41242 [40:21<1:33:45,  5.12it/s, training_loss=0.017]
Epoch 1:  30%|███       | 12423/41242 [40:21<1:33:45,  5.12it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12424/41242 [40:21<1:32:34,  5.19it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12424/41242 [40:21<1:32:34,  5.19it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12425/41242 [40:21<1:32:53,  5.17it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12425/41242 [40:21<1:32:53,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12426/41242 [40:21<1:32:58,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12426/41242 [40:21<1:32:58,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12427/41242 [40:21<1:33:15,  5.15it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12427/41242 [40:22<1:33:15,  5.15it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12428/41242 [40:22<1:33:32,  5.13it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12428/41242 [40:22<1:33:32,  5.13it/s, training_loss=0.026]
Epoch 1:  30%|███       | 12429/41242 [40:22<1:33:31,  5.13it/s, training_loss=0.026]
Epoch 1:  30%|███       | 12429/41242 [40:22<1:33:31,  5.13it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12430/41242 [40:22<1:32:15,  5.20it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12430/41242 [40:22<1:32:15,  5.20it/s, training_loss=0.012]
Epoch 1:  30%|███       | 12431/41242 [40:22<1:32:07,  5.21it/s, training_loss=0.012]
Epoch 1:  30%|███       | 12431/41242 [40:22<1:32:07,  5.21it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12432/41242 [40:22<1:30:59,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12432/41242 [40:22<1:30:59,  5.28it/s, training_loss=0.031]
Epoch 1:  30%|███       | 12433/41242 [40:22<1:30:51,  5.28it/s, training_loss=0.031]
Epoch 1:  30%|███       | 12433/41242 [40:23<1:30:51,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12434/41242 [40:23<1:29:51,  5.34it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12434/41242 [40:23<1:29:51,  5.34it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12435/41242 [40:23<1:30:19,  5.32it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12435/41242 [40:23<1:30:19,  5.32it/s, training_loss=0.236]
Epoch 1:  30%|███       | 12436/41242 [40:23<1:30:39,  5.30it/s, training_loss=0.236]
Epoch 1:  30%|███       | 12436/41242 [40:23<1:30:39,  5.30it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12437/41242 [40:23<1:29:42,  5.35it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12437/41242 [40:23<1:29:42,  5.35it/s, training_loss=0.131]
Epoch 1:  30%|███       | 12438/41242 [40:23<1:30:20,  5.31it/s, training_loss=0.131]
Epoch 1:  30%|███       | 12438/41242 [40:24<1:30:20,  5.31it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12439/41242 [40:24<1:31:13,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12439/41242 [40:24<1:31:13,  5.26it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12440/41242 [40:24<1:30:17,  5.32it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12440/41242 [40:24<1:30:17,  5.32it/s, training_loss=0.774]
Epoch 1:  30%|███       | 12441/41242 [40:24<1:30:35,  5.30it/s, training_loss=0.774]
Epoch 1:  30%|███       | 12441/41242 [40:24<1:30:35,  5.30it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12442/41242 [40:24<1:30:54,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12442/41242 [40:24<1:30:54,  5.28it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12443/41242 [40:24<1:32:17,  5.20it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12443/41242 [40:25<1:32:17,  5.20it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12444/41242 [40:25<1:31:17,  5.26it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12444/41242 [40:25<1:31:17,  5.26it/s, training_loss=0.211]
Epoch 1:  30%|███       | 12445/41242 [40:25<1:32:39,  5.18it/s, training_loss=0.211]
Epoch 1:  30%|███       | 12445/41242 [40:25<1:32:39,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12446/41242 [40:25<1:34:56,  5.06it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12446/41242 [40:25<1:34:56,  5.06it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12447/41242 [40:25<1:33:20,  5.14it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12447/41242 [40:25<1:33:20,  5.14it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12448/41242 [40:25<1:31:49,  5.23it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12448/41242 [40:26<1:31:49,  5.23it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12449/41242 [40:26<1:30:57,  5.28it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12449/41242 [40:26<1:30:57,  5.28it/s, training_loss=0.161]
Epoch 1:  30%|███       | 12450/41242 [40:26<1:31:55,  5.22it/s, training_loss=0.161]
Epoch 1:  30%|███       | 12450/41242 [40:26<1:31:55,  5.22it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12451/41242 [40:26<1:31:13,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12451/41242 [40:26<1:31:13,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12452/41242 [40:26<1:30:26,  5.31it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12452/41242 [40:26<1:30:26,  5.31it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12453/41242 [40:26<1:32:13,  5.20it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12453/41242 [40:26<1:32:13,  5.20it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12454/41242 [40:27<1:33:25,  5.14it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12454/41242 [40:27<1:33:25,  5.14it/s, training_loss=0.018]
Epoch 1:  30%|███       | 12455/41242 [40:27<1:32:58,  5.16it/s, training_loss=0.018]
Epoch 1:  30%|███       | 12455/41242 [40:27<1:32:58,  5.16it/s, training_loss=0.129]
Epoch 1:  30%|███       | 12456/41242 [40:27<1:32:39,  5.18it/s, training_loss=0.129]
Epoch 1:  30%|███       | 12456/41242 [40:27<1:32:39,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12457/41242 [40:27<1:31:07,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12457/41242 [40:27<1:31:07,  5.26it/s, training_loss=0.198]
Epoch 1:  30%|███       | 12458/41242 [40:27<1:31:36,  5.24it/s, training_loss=0.198]
Epoch 1:  30%|███       | 12458/41242 [40:27<1:31:36,  5.24it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12459/41242 [40:27<1:31:28,  5.24it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12459/41242 [40:28<1:31:28,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12460/41242 [40:28<1:31:31,  5.24it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12460/41242 [40:28<1:31:31,  5.24it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12461/41242 [40:28<1:34:29,  5.08it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12461/41242 [40:28<1:34:29,  5.08it/s, training_loss=0.260]
Epoch 1:  30%|███       | 12462/41242 [40:28<1:34:48,  5.06it/s, training_loss=0.260]
Epoch 1:  30%|███       | 12462/41242 [40:28<1:34:48,  5.06it/s, training_loss=0.281]
Epoch 1:  30%|███       | 12463/41242 [40:28<1:34:21,  5.08it/s, training_loss=0.281]
Epoch 1:  30%|███       | 12463/41242 [40:28<1:34:21,  5.08it/s, training_loss=0.306]
Epoch 1:  30%|███       | 12464/41242 [40:28<1:33:51,  5.11it/s, training_loss=0.306]
Epoch 1:  30%|███       | 12464/41242 [40:29<1:33:51,  5.11it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12465/41242 [40:29<1:32:32,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12465/41242 [40:29<1:32:32,  5.18it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12466/41242 [40:29<1:31:49,  5.22it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12466/41242 [40:29<1:31:49,  5.22it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12467/41242 [40:29<1:31:46,  5.23it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12467/41242 [40:29<1:31:46,  5.23it/s, training_loss=0.015]
Epoch 1:  30%|███       | 12468/41242 [40:29<1:31:41,  5.23it/s, training_loss=0.015]
Epoch 1:  30%|███       | 12468/41242 [40:29<1:31:41,  5.23it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12469/41242 [40:29<1:30:13,  5.31it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12469/41242 [40:30<1:30:13,  5.31it/s, training_loss=0.193]
Epoch 1:  30%|███       | 12470/41242 [40:30<1:31:28,  5.24it/s, training_loss=0.193]
Epoch 1:  30%|███       | 12470/41242 [40:30<1:31:28,  5.24it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12471/41242 [40:30<1:30:58,  5.27it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12471/41242 [40:30<1:30:58,  5.27it/s, training_loss=0.096]
Epoch 1:  30%|███       | 12472/41242 [40:30<1:30:40,  5.29it/s, training_loss=0.096]
Epoch 1:  30%|███       | 12472/41242 [40:30<1:30:40,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12473/41242 [40:30<1:30:10,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12473/41242 [40:30<1:30:10,  5.32it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12474/41242 [40:30<1:29:59,  5.33it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12474/41242 [40:31<1:29:59,  5.33it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12475/41242 [40:31<1:30:58,  5.27it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12475/41242 [40:31<1:30:58,  5.27it/s, training_loss=0.010]
Epoch 1:  30%|███       | 12476/41242 [40:31<1:33:43,  5.12it/s, training_loss=0.010]
Epoch 1:  30%|███       | 12476/41242 [40:31<1:33:43,  5.12it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12477/41242 [40:31<1:33:01,  5.15it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12477/41242 [40:31<1:33:01,  5.15it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12478/41242 [40:31<1:32:27,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12478/41242 [40:31<1:32:27,  5.18it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12479/41242 [40:31<1:33:23,  5.13it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12479/41242 [40:32<1:33:23,  5.13it/s, training_loss=0.025]
Epoch 1:  30%|███       | 12480/41242 [40:32<1:33:20,  5.14it/s, training_loss=0.025]
Epoch 1:  30%|███       | 12480/41242 [40:32<1:33:20,  5.14it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12481/41242 [40:32<1:33:45,  5.11it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12481/41242 [40:32<1:33:45,  5.11it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12482/41242 [40:32<1:33:04,  5.15it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12482/41242 [40:32<1:33:04,  5.15it/s, training_loss=0.021]
Epoch 1:  30%|███       | 12483/41242 [40:32<1:33:05,  5.15it/s, training_loss=0.021]
Epoch 1:  30%|███       | 12483/41242 [40:32<1:33:05,  5.15it/s, training_loss=0.350]
Epoch 1:  30%|███       | 12484/41242 [40:32<1:32:37,  5.17it/s, training_loss=0.350]
Epoch 1:  30%|███       | 12484/41242 [40:32<1:32:37,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12485/41242 [40:32<1:31:56,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12485/41242 [40:33<1:31:56,  5.21it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12486/41242 [40:33<1:30:44,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12486/41242 [40:33<1:30:44,  5.28it/s, training_loss=0.334]
Epoch 1:  30%|███       | 12487/41242 [40:33<1:31:05,  5.26it/s, training_loss=0.334]
Epoch 1:  30%|███       | 12487/41242 [40:33<1:31:05,  5.26it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12488/41242 [40:33<1:30:30,  5.29it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12488/41242 [40:33<1:30:30,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12489/41242 [40:33<1:29:41,  5.34it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12489/41242 [40:33<1:29:41,  5.34it/s, training_loss=0.188]
Epoch 1:  30%|███       | 12490/41242 [40:33<1:30:22,  5.30it/s, training_loss=0.188]
Epoch 1:  30%|███       | 12490/41242 [40:34<1:30:22,  5.30it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12491/41242 [40:34<1:32:56,  5.16it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12491/41242 [40:34<1:32:56,  5.16it/s, training_loss=0.025]
Epoch 1:  30%|███       | 12492/41242 [40:34<1:33:01,  5.15it/s, training_loss=0.025]
Epoch 1:  30%|███       | 12492/41242 [40:34<1:33:01,  5.15it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12493/41242 [40:34<1:31:30,  5.24it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12493/41242 [40:34<1:31:30,  5.24it/s, training_loss=0.048]
Epoch 1:  30%|███       | 12494/41242 [40:34<1:31:16,  5.25it/s, training_loss=0.048]
Epoch 1:  30%|███       | 12494/41242 [40:34<1:31:16,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12495/41242 [40:34<1:30:44,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12495/41242 [40:35<1:30:44,  5.28it/s, training_loss=0.871]
Epoch 1:  30%|███       | 12496/41242 [40:35<1:30:59,  5.27it/s, training_loss=0.871]
Epoch 1:  30%|███       | 12496/41242 [40:35<1:30:59,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12497/41242 [40:35<1:30:28,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12497/41242 [40:35<1:30:28,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12498/41242 [40:35<1:30:33,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12498/41242 [40:35<1:30:33,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12499/41242 [40:35<1:30:28,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12499/41242 [40:35<1:30:28,  5.29it/s, training_loss=0.011]
Epoch 1:  30%|███       | 12500/41242 [40:35<1:30:37,  5.29it/s, training_loss=0.011]
Epoch 1:  30%|███       | 12500/41242 [40:36<1:30:37,  5.29it/s, training_loss=0.970]
Epoch 1:  30%|███       | 12501/41242 [40:36<1:31:46,  5.22it/s, training_loss=0.970]
Epoch 1:  30%|███       | 12501/41242 [40:36<1:31:46,  5.22it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12502/41242 [40:36<1:30:59,  5.26it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12502/41242 [40:36<1:30:59,  5.26it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12503/41242 [40:36<1:31:43,  5.22it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12503/41242 [40:36<1:31:43,  5.22it/s, training_loss=0.016]
Epoch 1:  30%|███       | 12504/41242 [40:36<1:32:00,  5.21it/s, training_loss=0.016]
Epoch 1:  30%|███       | 12504/41242 [40:36<1:32:00,  5.21it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12505/41242 [40:36<1:32:37,  5.17it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12505/41242 [40:36<1:32:37,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12506/41242 [40:36<1:31:39,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12506/41242 [40:37<1:31:39,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12507/41242 [40:37<1:31:25,  5.24it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12507/41242 [40:37<1:31:25,  5.24it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12508/41242 [40:37<1:31:30,  5.23it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12508/41242 [40:37<1:31:30,  5.23it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12509/41242 [40:37<1:31:03,  5.26it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12509/41242 [40:37<1:31:03,  5.26it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12510/41242 [40:37<1:30:44,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12510/41242 [40:37<1:30:44,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12511/41242 [40:37<1:30:13,  5.31it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12511/41242 [40:38<1:30:13,  5.31it/s, training_loss=0.481]
Epoch 1:  30%|███       | 12512/41242 [40:38<1:31:10,  5.25it/s, training_loss=0.481]
Epoch 1:  30%|███       | 12512/41242 [40:38<1:31:10,  5.25it/s, training_loss=0.182]
Epoch 1:  30%|███       | 12513/41242 [40:38<1:30:41,  5.28it/s, training_loss=0.182]
Epoch 1:  30%|███       | 12513/41242 [40:38<1:30:41,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12514/41242 [40:38<1:30:08,  5.31it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12514/41242 [40:38<1:30:08,  5.31it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12515/41242 [40:38<1:29:35,  5.34it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12515/41242 [40:38<1:29:35,  5.34it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12516/41242 [40:38<1:30:44,  5.28it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12516/41242 [40:39<1:30:44,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12517/41242 [40:39<1:30:55,  5.27it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12517/41242 [40:39<1:30:55,  5.27it/s, training_loss=0.014]
Epoch 1:  30%|███       | 12518/41242 [40:39<1:31:21,  5.24it/s, training_loss=0.014]
Epoch 1:  30%|███       | 12518/41242 [40:39<1:31:21,  5.24it/s, training_loss=0.598]
Epoch 1:  30%|███       | 12519/41242 [40:39<1:32:25,  5.18it/s, training_loss=0.598]
Epoch 1:  30%|███       | 12519/41242 [40:39<1:32:25,  5.18it/s, training_loss=0.438]
Epoch 1:  30%|███       | 12520/41242 [40:39<1:31:56,  5.21it/s, training_loss=0.438]
Epoch 1:  30%|███       | 12520/41242 [40:39<1:31:56,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12521/41242 [40:39<1:31:48,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12521/41242 [40:40<1:31:48,  5.21it/s, training_loss=0.988]
Epoch 1:  30%|███       | 12522/41242 [40:40<1:33:03,  5.14it/s, training_loss=0.988]
Epoch 1:  30%|███       | 12522/41242 [40:40<1:33:03,  5.14it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12523/41242 [40:40<1:32:38,  5.17it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12523/41242 [40:40<1:32:38,  5.17it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12524/41242 [40:40<1:33:15,  5.13it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12524/41242 [40:40<1:33:15,  5.13it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12525/41242 [40:40<1:34:06,  5.09it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12525/41242 [40:40<1:34:06,  5.09it/s, training_loss=0.059]
Epoch 1:  30%|███       | 12526/41242 [40:40<1:34:15,  5.08it/s, training_loss=0.059]
Epoch 1:  30%|███       | 12526/41242 [40:40<1:34:15,  5.08it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12527/41242 [40:40<1:33:15,  5.13it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12527/41242 [40:41<1:33:15,  5.13it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12528/41242 [40:41<1:31:50,  5.21it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12528/41242 [40:41<1:31:50,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12529/41242 [40:41<1:30:32,  5.29it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12529/41242 [40:41<1:30:32,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12530/41242 [40:41<1:32:02,  5.20it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12530/41242 [40:41<1:32:02,  5.20it/s, training_loss=0.476]
Epoch 1:  30%|███       | 12531/41242 [40:41<1:32:08,  5.19it/s, training_loss=0.476]
Epoch 1:  30%|███       | 12531/41242 [40:41<1:32:08,  5.19it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12532/41242 [40:41<1:34:06,  5.08it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12532/41242 [40:42<1:34:06,  5.08it/s, training_loss=0.761]
Epoch 1:  30%|███       | 12533/41242 [40:42<1:33:27,  5.12it/s, training_loss=0.761]
Epoch 1:  30%|███       | 12533/41242 [40:42<1:33:27,  5.12it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12534/41242 [40:42<1:32:24,  5.18it/s, training_loss=0.028]
Epoch 1:  30%|███       | 12534/41242 [40:42<1:32:24,  5.18it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12535/41242 [40:42<1:31:51,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12535/41242 [40:42<1:31:51,  5.21it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12536/41242 [40:42<1:31:35,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12536/41242 [40:42<1:31:35,  5.22it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12537/41242 [40:42<1:30:43,  5.27it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12537/41242 [40:43<1:30:43,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12538/41242 [40:43<1:29:30,  5.34it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12538/41242 [40:43<1:29:30,  5.34it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12539/41242 [40:43<1:30:37,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12539/41242 [40:43<1:30:37,  5.28it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12540/41242 [40:43<1:30:08,  5.31it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12540/41242 [40:43<1:30:08,  5.31it/s, training_loss=0.110]
Epoch 1:  30%|███       | 12541/41242 [40:43<1:30:45,  5.27it/s, training_loss=0.110]
Epoch 1:  30%|███       | 12541/41242 [40:43<1:30:45,  5.27it/s, training_loss=0.054]
Epoch 1:  30%|███       | 12542/41242 [40:43<1:31:01,  5.26it/s, training_loss=0.054]
Epoch 1:  30%|███       | 12542/41242 [40:44<1:31:01,  5.26it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12543/41242 [40:44<1:30:26,  5.29it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12543/41242 [40:44<1:30:26,  5.29it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12544/41242 [40:44<1:30:24,  5.29it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12544/41242 [40:44<1:30:24,  5.29it/s, training_loss=0.971]
Epoch 1:  30%|███       | 12545/41242 [40:44<1:30:42,  5.27it/s, training_loss=0.971]
Epoch 1:  30%|███       | 12545/41242 [40:44<1:30:42,  5.27it/s, training_loss=0.254]
Epoch 1:  30%|███       | 12546/41242 [40:44<1:30:49,  5.27it/s, training_loss=0.254]
Epoch 1:  30%|███       | 12546/41242 [40:44<1:30:49,  5.27it/s, training_loss=0.248]
Epoch 1:  30%|███       | 12547/41242 [40:44<1:30:31,  5.28it/s, training_loss=0.248]
Epoch 1:  30%|███       | 12547/41242 [40:44<1:30:31,  5.28it/s, training_loss=0.023]
Epoch 1:  30%|███       | 12548/41242 [40:45<1:32:25,  5.17it/s, training_loss=0.023]
Epoch 1:  30%|███       | 12548/41242 [40:45<1:32:25,  5.17it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12549/41242 [40:45<1:31:49,  5.21it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12549/41242 [40:45<1:31:49,  5.21it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12550/41242 [40:45<1:30:44,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12550/41242 [40:45<1:30:44,  5.27it/s, training_loss=0.014]
Epoch 1:  30%|███       | 12551/41242 [40:45<1:30:46,  5.27it/s, training_loss=0.014]
Epoch 1:  30%|███       | 12551/41242 [40:45<1:30:46,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12552/41242 [40:45<1:30:42,  5.27it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12552/41242 [40:45<1:30:42,  5.27it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12553/41242 [40:45<1:30:38,  5.28it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12553/41242 [40:46<1:30:38,  5.28it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12554/41242 [40:46<1:30:06,  5.31it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12554/41242 [40:46<1:30:06,  5.31it/s, training_loss=0.039]
Epoch 1:  30%|███       | 12555/41242 [40:46<1:30:29,  5.28it/s, training_loss=0.039]
Epoch 1:  30%|███       | 12555/41242 [40:46<1:30:29,  5.28it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12556/41242 [40:46<1:30:15,  5.30it/s, training_loss=0.002]
Epoch 1:  30%|███       | 12556/41242 [40:46<1:30:15,  5.30it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12557/41242 [40:46<1:29:16,  5.35it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12557/41242 [40:46<1:29:16,  5.35it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12558/41242 [40:46<1:29:01,  5.37it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12558/41242 [40:47<1:29:01,  5.37it/s, training_loss=0.024]
Epoch 1:  30%|███       | 12559/41242 [40:47<1:29:57,  5.31it/s, training_loss=0.024]
Epoch 1:  30%|███       | 12559/41242 [40:47<1:29:57,  5.31it/s, training_loss=0.453]
Epoch 1:  30%|███       | 12560/41242 [40:47<1:29:45,  5.33it/s, training_loss=0.453]
Epoch 1:  30%|███       | 12560/41242 [40:47<1:29:45,  5.33it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12561/41242 [40:47<1:30:57,  5.25it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12561/41242 [40:47<1:30:57,  5.25it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12562/41242 [40:47<1:30:31,  5.28it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12562/41242 [40:47<1:30:31,  5.28it/s, training_loss=0.007]
Epoch 1:  30%|███       | 12563/41242 [40:47<1:30:17,  5.29it/s, training_loss=0.007]
Epoch 1:  30%|███       | 12563/41242 [40:48<1:30:17,  5.29it/s, training_loss=0.317]
Epoch 1:  30%|███       | 12564/41242 [40:48<1:30:14,  5.30it/s, training_loss=0.317]
Epoch 1:  30%|███       | 12564/41242 [40:48<1:30:14,  5.30it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12565/41242 [40:48<1:29:41,  5.33it/s, training_loss=0.001]
Epoch 1:  30%|███       | 12565/41242 [40:48<1:29:41,  5.33it/s, training_loss=0.199]
Epoch 1:  30%|███       | 12566/41242 [40:48<1:29:43,  5.33it/s, training_loss=0.199]
Epoch 1:  30%|███       | 12566/41242 [40:48<1:29:43,  5.33it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12567/41242 [40:48<1:29:26,  5.34it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12567/41242 [40:48<1:29:26,  5.34it/s, training_loss=0.277]
Epoch 1:  30%|███       | 12568/41242 [40:48<1:29:58,  5.31it/s, training_loss=0.277]
Epoch 1:  30%|███       | 12568/41242 [40:48<1:29:58,  5.31it/s, training_loss=0.839]
Epoch 1:  30%|███       | 12569/41242 [40:48<1:30:05,  5.30it/s, training_loss=0.839]
Epoch 1:  30%|███       | 12569/41242 [40:49<1:30:05,  5.30it/s, training_loss=0.346]
Epoch 1:  30%|███       | 12570/41242 [40:49<1:30:16,  5.29it/s, training_loss=0.346]
Epoch 1:  30%|███       | 12570/41242 [40:49<1:30:16,  5.29it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12571/41242 [40:49<1:31:18,  5.23it/s, training_loss=0.008]
Epoch 1:  30%|███       | 12571/41242 [40:49<1:31:18,  5.23it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12572/41242 [40:49<1:30:13,  5.30it/s, training_loss=0.004]
Epoch 1:  30%|███       | 12572/41242 [40:49<1:30:13,  5.30it/s, training_loss=0.017]
Epoch 1:  30%|███       | 12573/41242 [40:49<1:29:45,  5.32it/s, training_loss=0.017]
Epoch 1:  30%|███       | 12573/41242 [40:49<1:29:45,  5.32it/s, training_loss=0.111]
Epoch 1:  30%|███       | 12574/41242 [40:49<1:29:47,  5.32it/s, training_loss=0.111]
Epoch 1:  30%|███       | 12574/41242 [40:50<1:29:47,  5.32it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12575/41242 [40:50<1:30:41,  5.27it/s, training_loss=0.006]
Epoch 1:  30%|███       | 12575/41242 [40:50<1:30:41,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12576/41242 [40:50<1:30:18,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12576/41242 [40:50<1:30:18,  5.29it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12577/41242 [40:50<1:30:40,  5.27it/s, training_loss=0.003]
Epoch 1:  30%|███       | 12577/41242 [40:50<1:30:40,  5.27it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12578/41242 [40:50<1:30:01,  5.31it/s, training_loss=0.005]
Epoch 1:  30%|███       | 12578/41242 [40:50<1:30:01,  5.31it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12579/41242 [40:50<1:29:39,  5.33it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12579/41242 [40:51<1:29:39,  5.33it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12580/41242 [40:51<1:31:46,  5.21it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12580/41242 [40:51<1:31:46,  5.21it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12581/41242 [40:51<1:31:05,  5.24it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12581/41242 [40:51<1:31:05,  5.24it/s, training_loss=0.045]
Epoch 1:  31%|███       | 12582/41242 [40:51<1:32:09,  5.18it/s, training_loss=0.045]
Epoch 1:  31%|███       | 12582/41242 [40:51<1:32:09,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12583/41242 [40:51<1:32:06,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12583/41242 [40:51<1:32:06,  5.19it/s, training_loss=0.528]
Epoch 1:  31%|███       | 12584/41242 [40:51<1:32:56,  5.14it/s, training_loss=0.528]
Epoch 1:  31%|███       | 12584/41242 [40:52<1:32:56,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12585/41242 [40:52<1:32:38,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12585/41242 [40:52<1:32:38,  5.16it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12586/41242 [40:52<1:32:19,  5.17it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12586/41242 [40:52<1:32:19,  5.17it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12587/41242 [40:52<1:31:21,  5.23it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12587/41242 [40:52<1:31:21,  5.23it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12588/41242 [40:52<1:34:23,  5.06it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12588/41242 [40:52<1:34:23,  5.06it/s, training_loss=0.016]
Epoch 1:  31%|███       | 12589/41242 [40:52<1:33:41,  5.10it/s, training_loss=0.016]
Epoch 1:  31%|███       | 12589/41242 [40:52<1:33:41,  5.10it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12590/41242 [40:52<1:32:51,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12590/41242 [40:53<1:32:51,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12591/41242 [40:53<1:31:11,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12591/41242 [40:53<1:31:11,  5.24it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12592/41242 [40:53<1:30:52,  5.25it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12592/41242 [40:53<1:30:52,  5.25it/s, training_loss=0.259]
Epoch 1:  31%|███       | 12593/41242 [40:53<1:31:05,  5.24it/s, training_loss=0.259]
Epoch 1:  31%|███       | 12593/41242 [40:53<1:31:05,  5.24it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12594/41242 [40:53<1:30:44,  5.26it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12594/41242 [40:53<1:30:44,  5.26it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12595/41242 [40:53<1:29:33,  5.33it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12595/41242 [40:54<1:29:33,  5.33it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12596/41242 [40:54<1:29:55,  5.31it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12596/41242 [40:54<1:29:55,  5.31it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12597/41242 [40:54<1:30:55,  5.25it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12597/41242 [40:54<1:30:55,  5.25it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12598/41242 [40:54<1:31:17,  5.23it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12598/41242 [40:54<1:31:17,  5.23it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12599/41242 [40:54<1:31:42,  5.21it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12599/41242 [40:54<1:31:42,  5.21it/s, training_loss=0.021]
Epoch 1:  31%|███       | 12600/41242 [40:54<1:31:08,  5.24it/s, training_loss=0.021]
Epoch 1:  31%|███       | 12600/41242 [40:55<1:31:08,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12601/41242 [40:55<1:30:10,  5.29it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12601/41242 [40:55<1:30:10,  5.29it/s, training_loss=0.258]
Epoch 1:  31%|███       | 12602/41242 [40:55<1:30:23,  5.28it/s, training_loss=0.258]
Epoch 1:  31%|███       | 12602/41242 [40:55<1:30:23,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12603/41242 [40:55<1:30:49,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12603/41242 [40:55<1:30:49,  5.26it/s, training_loss=0.710]
Epoch 1:  31%|███       | 12604/41242 [40:55<1:31:15,  5.23it/s, training_loss=0.710]
Epoch 1:  31%|███       | 12604/41242 [40:55<1:31:15,  5.23it/s, training_loss=0.042]
Epoch 1:  31%|███       | 12605/41242 [40:55<1:30:53,  5.25it/s, training_loss=0.042]
Epoch 1:  31%|███       | 12605/41242 [40:56<1:30:53,  5.25it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12606/41242 [40:56<1:30:49,  5.25it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12606/41242 [40:56<1:30:49,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12607/41242 [40:56<1:30:36,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12607/41242 [40:56<1:30:36,  5.27it/s, training_loss=0.030]
Epoch 1:  31%|███       | 12608/41242 [40:56<1:30:29,  5.27it/s, training_loss=0.030]
Epoch 1:  31%|███       | 12608/41242 [40:56<1:30:29,  5.27it/s, training_loss=0.322]
Epoch 1:  31%|███       | 12609/41242 [40:56<1:30:50,  5.25it/s, training_loss=0.322]
Epoch 1:  31%|███       | 12609/41242 [40:56<1:30:50,  5.25it/s, training_loss=0.680]
Epoch 1:  31%|███       | 12610/41242 [40:56<1:32:41,  5.15it/s, training_loss=0.680]
Epoch 1:  31%|███       | 12610/41242 [40:56<1:32:41,  5.15it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12611/41242 [40:56<1:32:46,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12611/41242 [40:57<1:32:46,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12612/41242 [40:57<1:32:22,  5.17it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12612/41242 [40:57<1:32:22,  5.17it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12613/41242 [40:57<1:32:30,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12613/41242 [40:57<1:32:30,  5.16it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12614/41242 [40:57<1:32:44,  5.14it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12614/41242 [40:57<1:32:44,  5.14it/s, training_loss=0.027]
Epoch 1:  31%|███       | 12615/41242 [40:57<1:31:56,  5.19it/s, training_loss=0.027]
Epoch 1:  31%|███       | 12615/41242 [40:57<1:31:56,  5.19it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12616/41242 [40:57<1:30:37,  5.26it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12616/41242 [40:58<1:30:37,  5.26it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12617/41242 [40:58<1:30:12,  5.29it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12617/41242 [40:58<1:30:12,  5.29it/s, training_loss=0.149]
Epoch 1:  31%|███       | 12618/41242 [40:58<1:30:20,  5.28it/s, training_loss=0.149]
Epoch 1:  31%|███       | 12618/41242 [40:58<1:30:20,  5.28it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12619/41242 [40:58<1:30:33,  5.27it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12619/41242 [40:58<1:30:33,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12620/41242 [40:58<1:31:48,  5.20it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12620/41242 [40:58<1:31:48,  5.20it/s, training_loss=0.331]
Epoch 1:  31%|███       | 12621/41242 [40:58<1:31:42,  5.20it/s, training_loss=0.331]
Epoch 1:  31%|███       | 12621/41242 [40:59<1:31:42,  5.20it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12622/41242 [40:59<1:31:00,  5.24it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12622/41242 [40:59<1:31:00,  5.24it/s, training_loss=0.168]
Epoch 1:  31%|███       | 12623/41242 [40:59<1:32:39,  5.15it/s, training_loss=0.168]
Epoch 1:  31%|███       | 12623/41242 [40:59<1:32:39,  5.15it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12624/41242 [40:59<1:35:54,  4.97it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12624/41242 [40:59<1:35:54,  4.97it/s, training_loss=0.355]
Epoch 1:  31%|███       | 12625/41242 [40:59<1:35:36,  4.99it/s, training_loss=0.355]
Epoch 1:  31%|███       | 12625/41242 [40:59<1:35:36,  4.99it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12626/41242 [40:59<1:35:02,  5.02it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12626/41242 [41:00<1:35:02,  5.02it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12627/41242 [41:00<1:33:46,  5.09it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12627/41242 [41:00<1:33:46,  5.09it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12628/41242 [41:00<1:33:51,  5.08it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12628/41242 [41:00<1:33:51,  5.08it/s, training_loss=0.871]
Epoch 1:  31%|███       | 12629/41242 [41:00<1:34:23,  5.05it/s, training_loss=0.871]
Epoch 1:  31%|███       | 12629/41242 [41:00<1:34:23,  5.05it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12630/41242 [41:00<1:35:59,  4.97it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12630/41242 [41:00<1:35:59,  4.97it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12631/41242 [41:00<1:35:02,  5.02it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12631/41242 [41:01<1:35:02,  5.02it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12632/41242 [41:01<1:34:11,  5.06it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12632/41242 [41:01<1:34:11,  5.06it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12633/41242 [41:01<1:33:47,  5.08it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12633/41242 [41:01<1:33:47,  5.08it/s, training_loss=0.060]
Epoch 1:  31%|███       | 12634/41242 [41:01<1:34:05,  5.07it/s, training_loss=0.060]
Epoch 1:  31%|███       | 12634/41242 [41:01<1:34:05,  5.07it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12635/41242 [41:01<1:33:40,  5.09it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12635/41242 [41:01<1:33:40,  5.09it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12636/41242 [41:01<1:32:57,  5.13it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12636/41242 [41:02<1:32:57,  5.13it/s, training_loss=0.277]
Epoch 1:  31%|███       | 12637/41242 [41:02<1:31:45,  5.20it/s, training_loss=0.277]
Epoch 1:  31%|███       | 12637/41242 [41:02<1:31:45,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12638/41242 [41:02<1:30:02,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12638/41242 [41:02<1:30:02,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12639/41242 [41:02<1:29:40,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12639/41242 [41:02<1:29:40,  5.32it/s, training_loss=0.008]
Epoch 1:  31%|███       | 12640/41242 [41:02<1:31:13,  5.23it/s, training_loss=0.008]
Epoch 1:  31%|███       | 12640/41242 [41:02<1:31:13,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12641/41242 [41:02<1:31:08,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12641/41242 [41:03<1:31:08,  5.23it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12642/41242 [41:03<1:31:19,  5.22it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12642/41242 [41:03<1:31:19,  5.22it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12643/41242 [41:03<1:30:22,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12643/41242 [41:03<1:30:22,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12644/41242 [41:03<1:29:58,  5.30it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12644/41242 [41:03<1:29:58,  5.30it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12645/41242 [41:03<1:29:16,  5.34it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12645/41242 [41:03<1:29:16,  5.34it/s, training_loss=0.053]
Epoch 1:  31%|███       | 12646/41242 [41:03<1:30:50,  5.25it/s, training_loss=0.053]
Epoch 1:  31%|███       | 12646/41242 [41:03<1:30:50,  5.25it/s, training_loss=0.893]
Epoch 1:  31%|███       | 12647/41242 [41:03<1:31:41,  5.20it/s, training_loss=0.893]
Epoch 1:  31%|███       | 12647/41242 [41:04<1:31:41,  5.20it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12648/41242 [41:04<1:31:23,  5.21it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12648/41242 [41:04<1:31:23,  5.21it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12649/41242 [41:04<1:30:13,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12649/41242 [41:04<1:30:13,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12650/41242 [41:04<1:29:57,  5.30it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12650/41242 [41:04<1:29:57,  5.30it/s, training_loss=0.869]
Epoch 1:  31%|███       | 12651/41242 [41:04<1:30:07,  5.29it/s, training_loss=0.869]
Epoch 1:  31%|███       | 12651/41242 [41:04<1:30:07,  5.29it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12652/41242 [41:04<1:32:56,  5.13it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12652/41242 [41:05<1:32:56,  5.13it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12653/41242 [41:05<1:33:10,  5.11it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12653/41242 [41:05<1:33:10,  5.11it/s, training_loss=0.525]
Epoch 1:  31%|███       | 12654/41242 [41:05<1:32:35,  5.15it/s, training_loss=0.525]
Epoch 1:  31%|███       | 12654/41242 [41:05<1:32:35,  5.15it/s, training_loss=0.126]
Epoch 1:  31%|███       | 12655/41242 [41:05<1:33:01,  5.12it/s, training_loss=0.126]
Epoch 1:  31%|███       | 12655/41242 [41:05<1:33:01,  5.12it/s, training_loss=0.346]
Epoch 1:  31%|███       | 12656/41242 [41:05<1:32:58,  5.12it/s, training_loss=0.346]
Epoch 1:  31%|███       | 12656/41242 [41:05<1:32:58,  5.12it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12657/41242 [41:05<1:32:48,  5.13it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12657/41242 [41:06<1:32:48,  5.13it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12658/41242 [41:06<1:31:39,  5.20it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12658/41242 [41:06<1:31:39,  5.20it/s, training_loss=0.060]
Epoch 1:  31%|███       | 12659/41242 [41:06<1:32:58,  5.12it/s, training_loss=0.060]
Epoch 1:  31%|███       | 12659/41242 [41:06<1:32:58,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12660/41242 [41:06<1:33:19,  5.10it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12660/41242 [41:06<1:33:19,  5.10it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12661/41242 [41:06<1:32:06,  5.17it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12661/41242 [41:06<1:32:06,  5.17it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12662/41242 [41:06<1:32:01,  5.18it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12662/41242 [41:07<1:32:01,  5.18it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12663/41242 [41:07<1:32:47,  5.13it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12663/41242 [41:07<1:32:47,  5.13it/s, training_loss=0.593]
Epoch 1:  31%|███       | 12664/41242 [41:07<1:33:46,  5.08it/s, training_loss=0.593]
Epoch 1:  31%|███       | 12664/41242 [41:07<1:33:46,  5.08it/s, training_loss=0.333]
Epoch 1:  31%|███       | 12665/41242 [41:07<1:34:19,  5.05it/s, training_loss=0.333]
Epoch 1:  31%|███       | 12665/41242 [41:07<1:34:19,  5.05it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12666/41242 [41:07<1:33:03,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12666/41242 [41:07<1:33:03,  5.12it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12667/41242 [41:07<1:33:31,  5.09it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12667/41242 [41:08<1:33:31,  5.09it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12668/41242 [41:08<1:33:20,  5.10it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12668/41242 [41:08<1:33:20,  5.10it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12669/41242 [41:08<1:31:50,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12669/41242 [41:08<1:31:50,  5.18it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12670/41242 [41:08<1:33:12,  5.11it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12670/41242 [41:08<1:33:12,  5.11it/s, training_loss=0.020]
Epoch 1:  31%|███       | 12671/41242 [41:08<1:32:04,  5.17it/s, training_loss=0.020]
Epoch 1:  31%|███       | 12671/41242 [41:08<1:32:04,  5.17it/s, training_loss=0.017]
Epoch 1:  31%|███       | 12672/41242 [41:08<1:32:04,  5.17it/s, training_loss=0.017]
Epoch 1:  31%|███       | 12672/41242 [41:08<1:32:04,  5.17it/s, training_loss=0.023]
Epoch 1:  31%|███       | 12673/41242 [41:08<1:31:34,  5.20it/s, training_loss=0.023]
Epoch 1:  31%|███       | 12673/41242 [41:09<1:31:34,  5.20it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12674/41242 [41:09<1:30:49,  5.24it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12674/41242 [41:09<1:30:49,  5.24it/s, training_loss=0.024]
Epoch 1:  31%|███       | 12675/41242 [41:09<1:31:09,  5.22it/s, training_loss=0.024]
Epoch 1:  31%|███       | 12675/41242 [41:09<1:31:09,  5.22it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12676/41242 [41:09<1:30:30,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12676/41242 [41:09<1:30:30,  5.26it/s, training_loss=0.010]
Epoch 1:  31%|███       | 12677/41242 [41:09<1:30:24,  5.27it/s, training_loss=0.010]
Epoch 1:  31%|███       | 12677/41242 [41:09<1:30:24,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12678/41242 [41:09<1:30:01,  5.29it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12678/41242 [41:10<1:30:01,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12679/41242 [41:10<1:29:37,  5.31it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12679/41242 [41:10<1:29:37,  5.31it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12680/41242 [41:10<1:28:54,  5.35it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12680/41242 [41:10<1:28:54,  5.35it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12681/41242 [41:10<1:30:48,  5.24it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12681/41242 [41:10<1:30:48,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12682/41242 [41:10<1:33:02,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12682/41242 [41:10<1:33:02,  5.12it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12683/41242 [41:10<1:32:25,  5.15it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12683/41242 [41:11<1:32:25,  5.15it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12684/41242 [41:11<1:32:02,  5.17it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12684/41242 [41:11<1:32:02,  5.17it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12685/41242 [41:11<1:30:56,  5.23it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12685/41242 [41:11<1:30:56,  5.23it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12686/41242 [41:11<1:29:44,  5.30it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12686/41242 [41:11<1:29:44,  5.30it/s, training_loss=0.018]
Epoch 1:  31%|███       | 12687/41242 [41:11<1:29:47,  5.30it/s, training_loss=0.018]
Epoch 1:  31%|███       | 12687/41242 [41:11<1:29:47,  5.30it/s, training_loss=0.025]
Epoch 1:  31%|███       | 12688/41242 [41:11<1:31:33,  5.20it/s, training_loss=0.025]
Epoch 1:  31%|███       | 12688/41242 [41:12<1:31:33,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12689/41242 [41:12<1:30:27,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12689/41242 [41:12<1:30:27,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12690/41242 [41:12<1:30:06,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12690/41242 [41:12<1:30:06,  5.28it/s, training_loss=0.038]
Epoch 1:  31%|███       | 12691/41242 [41:12<1:30:02,  5.28it/s, training_loss=0.038]
Epoch 1:  31%|███       | 12691/41242 [41:12<1:30:02,  5.28it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12692/41242 [41:12<1:29:00,  5.35it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12692/41242 [41:12<1:29:00,  5.35it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12693/41242 [41:12<1:28:33,  5.37it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12693/41242 [41:12<1:28:33,  5.37it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12694/41242 [41:12<1:28:20,  5.39it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12694/41242 [41:13<1:28:20,  5.39it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12695/41242 [41:13<1:28:12,  5.39it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12695/41242 [41:13<1:28:12,  5.39it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12696/41242 [41:13<1:30:22,  5.26it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12696/41242 [41:13<1:30:22,  5.26it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12697/41242 [41:13<1:30:27,  5.26it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12697/41242 [41:13<1:30:27,  5.26it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12698/41242 [41:13<1:31:29,  5.20it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12698/41242 [41:13<1:31:29,  5.20it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12699/41242 [41:13<1:31:55,  5.18it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12699/41242 [41:14<1:31:55,  5.18it/s, training_loss=0.076]
Epoch 1:  31%|███       | 12700/41242 [41:14<1:33:09,  5.11it/s, training_loss=0.076]
Epoch 1:  31%|███       | 12700/41242 [41:14<1:33:09,  5.11it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12701/41242 [41:14<1:33:42,  5.08it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12701/41242 [41:14<1:33:42,  5.08it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12702/41242 [41:14<1:33:28,  5.09it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12702/41242 [41:14<1:33:28,  5.09it/s, training_loss=0.024]
Epoch 1:  31%|███       | 12703/41242 [41:14<1:33:24,  5.09it/s, training_loss=0.024]
Epoch 1:  31%|███       | 12703/41242 [41:14<1:33:24,  5.09it/s, training_loss=0.138]
Epoch 1:  31%|███       | 12704/41242 [41:14<1:32:31,  5.14it/s, training_loss=0.138]
Epoch 1:  31%|███       | 12704/41242 [41:15<1:32:31,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12705/41242 [41:15<1:33:06,  5.11it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12705/41242 [41:15<1:33:06,  5.11it/s, training_loss=0.091]
Epoch 1:  31%|███       | 12706/41242 [41:15<1:33:27,  5.09it/s, training_loss=0.091]
Epoch 1:  31%|███       | 12706/41242 [41:15<1:33:27,  5.09it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12707/41242 [41:15<1:33:44,  5.07it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12707/41242 [41:15<1:33:44,  5.07it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12708/41242 [41:15<1:32:09,  5.16it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12708/41242 [41:15<1:32:09,  5.16it/s, training_loss=0.027]
Epoch 1:  31%|███       | 12709/41242 [41:15<1:36:18,  4.94it/s, training_loss=0.027]
Epoch 1:  31%|███       | 12709/41242 [41:16<1:36:18,  4.94it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12710/41242 [41:16<1:35:12,  4.99it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12710/41242 [41:16<1:35:12,  4.99it/s, training_loss=0.258]
Epoch 1:  31%|███       | 12711/41242 [41:16<1:35:17,  4.99it/s, training_loss=0.258]
Epoch 1:  31%|███       | 12711/41242 [41:16<1:35:17,  4.99it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12712/41242 [41:16<1:34:46,  5.02it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12712/41242 [41:16<1:34:46,  5.02it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12713/41242 [41:16<1:33:12,  5.10it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12713/41242 [41:16<1:33:12,  5.10it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12714/41242 [41:16<1:33:01,  5.11it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12714/41242 [41:17<1:33:01,  5.11it/s, training_loss=0.340]
Epoch 1:  31%|███       | 12715/41242 [41:17<1:31:40,  5.19it/s, training_loss=0.340]
Epoch 1:  31%|███       | 12715/41242 [41:17<1:31:40,  5.19it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12716/41242 [41:17<1:30:31,  5.25it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12716/41242 [41:17<1:30:31,  5.25it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12717/41242 [41:17<1:29:53,  5.29it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12717/41242 [41:17<1:29:53,  5.29it/s, training_loss=0.012]
Epoch 1:  31%|███       | 12718/41242 [41:17<1:31:11,  5.21it/s, training_loss=0.012]
Epoch 1:  31%|███       | 12718/41242 [41:17<1:31:11,  5.21it/s, training_loss=0.799]
Epoch 1:  31%|███       | 12719/41242 [41:17<1:32:32,  5.14it/s, training_loss=0.799]
Epoch 1:  31%|███       | 12719/41242 [41:18<1:32:32,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12720/41242 [41:18<1:32:27,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12720/41242 [41:18<1:32:27,  5.14it/s, training_loss=0.142]
Epoch 1:  31%|███       | 12721/41242 [41:18<1:32:48,  5.12it/s, training_loss=0.142]
Epoch 1:  31%|███       | 12721/41242 [41:18<1:32:48,  5.12it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12722/41242 [41:18<1:34:34,  5.03it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12722/41242 [41:18<1:34:34,  5.03it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12723/41242 [41:18<1:33:56,  5.06it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12723/41242 [41:18<1:33:56,  5.06it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12724/41242 [41:18<1:35:09,  4.99it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12724/41242 [41:19<1:35:09,  4.99it/s, training_loss=0.678]
Epoch 1:  31%|███       | 12725/41242 [41:19<1:35:20,  4.98it/s, training_loss=0.678]
Epoch 1:  31%|███       | 12725/41242 [41:19<1:35:20,  4.98it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12726/41242 [41:19<1:34:20,  5.04it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12726/41242 [41:19<1:34:20,  5.04it/s, training_loss=0.664]
Epoch 1:  31%|███       | 12727/41242 [41:19<1:35:48,  4.96it/s, training_loss=0.664]
Epoch 1:  31%|███       | 12727/41242 [41:19<1:35:48,  4.96it/s, training_loss=0.405]
Epoch 1:  31%|███       | 12728/41242 [41:19<1:34:55,  5.01it/s, training_loss=0.405]
Epoch 1:  31%|███       | 12728/41242 [41:19<1:34:55,  5.01it/s, training_loss=0.264]
Epoch 1:  31%|███       | 12729/41242 [41:19<1:34:40,  5.02it/s, training_loss=0.264]
Epoch 1:  31%|███       | 12729/41242 [41:20<1:34:40,  5.02it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12730/41242 [41:20<1:34:52,  5.01it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12730/41242 [41:20<1:34:52,  5.01it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12731/41242 [41:20<1:35:00,  5.00it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12731/41242 [41:20<1:35:00,  5.00it/s, training_loss=0.206]
Epoch 1:  31%|███       | 12732/41242 [41:20<1:34:55,  5.01it/s, training_loss=0.206]
Epoch 1:  31%|███       | 12732/41242 [41:20<1:34:55,  5.01it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12733/41242 [41:20<1:33:23,  5.09it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12733/41242 [41:20<1:33:23,  5.09it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12734/41242 [41:20<1:32:06,  5.16it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12734/41242 [41:21<1:32:06,  5.16it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12735/41242 [41:21<1:31:01,  5.22it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12735/41242 [41:21<1:31:01,  5.22it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12736/41242 [41:21<1:29:58,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12736/41242 [41:21<1:29:58,  5.28it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12737/41242 [41:21<1:29:26,  5.31it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12737/41242 [41:21<1:29:26,  5.31it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12738/41242 [41:21<1:28:45,  5.35it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12738/41242 [41:21<1:28:45,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12739/41242 [41:21<1:28:04,  5.39it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12739/41242 [41:21<1:28:04,  5.39it/s, training_loss=0.122]
Epoch 1:  31%|███       | 12740/41242 [41:21<1:28:21,  5.38it/s, training_loss=0.122]
Epoch 1:  31%|███       | 12740/41242 [41:22<1:28:21,  5.38it/s, training_loss=0.041]
Epoch 1:  31%|███       | 12741/41242 [41:22<1:29:37,  5.30it/s, training_loss=0.041]
Epoch 1:  31%|███       | 12741/41242 [41:22<1:29:37,  5.30it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12742/41242 [41:22<1:30:29,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12742/41242 [41:22<1:30:29,  5.25it/s, training_loss=0.632]
Epoch 1:  31%|███       | 12743/41242 [41:22<1:31:10,  5.21it/s, training_loss=0.632]
Epoch 1:  31%|███       | 12743/41242 [41:22<1:31:10,  5.21it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12744/41242 [41:22<1:31:20,  5.20it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12744/41242 [41:22<1:31:20,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12745/41242 [41:22<1:32:08,  5.15it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12745/41242 [41:23<1:32:08,  5.15it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12746/41242 [41:23<1:31:08,  5.21it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12746/41242 [41:23<1:31:08,  5.21it/s, training_loss=0.020]
Epoch 1:  31%|███       | 12747/41242 [41:23<1:32:58,  5.11it/s, training_loss=0.020]
Epoch 1:  31%|███       | 12747/41242 [41:23<1:32:58,  5.11it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12748/41242 [41:23<1:31:27,  5.19it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12748/41242 [41:23<1:31:27,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12749/41242 [41:23<1:30:42,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12749/41242 [41:23<1:30:42,  5.24it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12750/41242 [41:23<1:31:43,  5.18it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12750/41242 [41:24<1:31:43,  5.18it/s, training_loss=0.090]
Epoch 1:  31%|███       | 12751/41242 [41:24<1:32:11,  5.15it/s, training_loss=0.090]
Epoch 1:  31%|███       | 12751/41242 [41:24<1:32:11,  5.15it/s, training_loss=0.037]
Epoch 1:  31%|███       | 12752/41242 [41:24<1:31:55,  5.17it/s, training_loss=0.037]
Epoch 1:  31%|███       | 12752/41242 [41:24<1:31:55,  5.17it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12753/41242 [41:24<1:31:05,  5.21it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12753/41242 [41:24<1:31:05,  5.21it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12754/41242 [41:24<1:31:15,  5.20it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12754/41242 [41:24<1:31:15,  5.20it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12755/41242 [41:24<1:31:15,  5.20it/s, training_loss=0.015]
Epoch 1:  31%|███       | 12755/41242 [41:25<1:31:15,  5.20it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12756/41242 [41:25<1:31:05,  5.21it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12756/41242 [41:25<1:31:05,  5.21it/s, training_loss=0.045]
Epoch 1:  31%|███       | 12757/41242 [41:25<1:30:32,  5.24it/s, training_loss=0.045]
Epoch 1:  31%|███       | 12757/41242 [41:25<1:30:32,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12758/41242 [41:25<1:29:27,  5.31it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12758/41242 [41:25<1:29:27,  5.31it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12759/41242 [41:25<1:29:13,  5.32it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12759/41242 [41:25<1:29:13,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12760/41242 [41:25<1:28:41,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12760/41242 [41:25<1:28:41,  5.35it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12761/41242 [41:25<1:28:38,  5.35it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12761/41242 [41:26<1:28:38,  5.35it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12762/41242 [41:26<1:29:10,  5.32it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12762/41242 [41:26<1:29:10,  5.32it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12763/41242 [41:26<1:30:19,  5.26it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12763/41242 [41:26<1:30:19,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12764/41242 [41:26<1:30:07,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12764/41242 [41:26<1:30:07,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12765/41242 [41:26<1:30:33,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12765/41242 [41:26<1:30:33,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12766/41242 [41:26<1:30:53,  5.22it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12766/41242 [41:27<1:30:53,  5.22it/s, training_loss=0.017]
Epoch 1:  31%|███       | 12767/41242 [41:27<1:30:45,  5.23it/s, training_loss=0.017]
Epoch 1:  31%|███       | 12767/41242 [41:27<1:30:45,  5.23it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12768/41242 [41:27<1:30:34,  5.24it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12768/41242 [41:27<1:30:34,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12769/41242 [41:27<1:29:37,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12769/41242 [41:27<1:29:37,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12770/41242 [41:27<1:30:35,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12770/41242 [41:27<1:30:35,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12771/41242 [41:27<1:30:12,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12771/41242 [41:28<1:30:12,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12772/41242 [41:28<1:29:14,  5.32it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12772/41242 [41:28<1:29:14,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12773/41242 [41:28<1:30:41,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12773/41242 [41:28<1:30:41,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12774/41242 [41:28<1:30:04,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12774/41242 [41:28<1:30:04,  5.27it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12775/41242 [41:28<1:29:45,  5.29it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12775/41242 [41:28<1:29:45,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12776/41242 [41:28<1:28:46,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12776/41242 [41:28<1:28:46,  5.34it/s, training_loss=0.344]
Epoch 1:  31%|███       | 12777/41242 [41:29<1:29:11,  5.32it/s, training_loss=0.344]
Epoch 1:  31%|███       | 12777/41242 [41:29<1:29:11,  5.32it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12778/41242 [41:29<1:29:28,  5.30it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12778/41242 [41:29<1:29:28,  5.30it/s, training_loss=0.782]
Epoch 1:  31%|███       | 12779/41242 [41:29<1:29:14,  5.32it/s, training_loss=0.782]
Epoch 1:  31%|███       | 12779/41242 [41:29<1:29:14,  5.32it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12780/41242 [41:29<1:28:35,  5.35it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12780/41242 [41:29<1:28:35,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12781/41242 [41:29<1:28:27,  5.36it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12781/41242 [41:29<1:28:27,  5.36it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12782/41242 [41:29<1:28:06,  5.38it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12782/41242 [41:30<1:28:06,  5.38it/s, training_loss=0.537]
Epoch 1:  31%|███       | 12783/41242 [41:30<1:30:11,  5.26it/s, training_loss=0.537]
Epoch 1:  31%|███       | 12783/41242 [41:30<1:30:11,  5.26it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12784/41242 [41:30<1:32:03,  5.15it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12784/41242 [41:30<1:32:03,  5.15it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12785/41242 [41:30<1:31:23,  5.19it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12785/41242 [41:30<1:31:23,  5.19it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12786/41242 [41:30<1:30:30,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12786/41242 [41:30<1:30:30,  5.24it/s, training_loss=0.276]
Epoch 1:  31%|███       | 12787/41242 [41:30<1:30:18,  5.25it/s, training_loss=0.276]
Epoch 1:  31%|███       | 12787/41242 [41:31<1:30:18,  5.25it/s, training_loss=0.250]
Epoch 1:  31%|███       | 12788/41242 [41:31<1:30:36,  5.23it/s, training_loss=0.250]
Epoch 1:  31%|███       | 12788/41242 [41:31<1:30:36,  5.23it/s, training_loss=0.036]
Epoch 1:  31%|███       | 12789/41242 [41:31<1:31:28,  5.18it/s, training_loss=0.036]
Epoch 1:  31%|███       | 12789/41242 [41:31<1:31:28,  5.18it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12790/41242 [41:31<1:30:38,  5.23it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12790/41242 [41:31<1:30:38,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12791/41242 [41:31<1:30:56,  5.21it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12791/41242 [41:31<1:30:56,  5.21it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12792/41242 [41:31<1:32:14,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12792/41242 [41:32<1:32:14,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12793/41242 [41:32<1:30:57,  5.21it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12793/41242 [41:32<1:30:57,  5.21it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12794/41242 [41:32<1:30:25,  5.24it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12794/41242 [41:32<1:30:25,  5.24it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12795/41242 [41:32<1:29:41,  5.29it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12795/41242 [41:32<1:29:41,  5.29it/s, training_loss=0.344]
Epoch 1:  31%|███       | 12796/41242 [41:32<1:29:13,  5.31it/s, training_loss=0.344]
Epoch 1:  31%|███       | 12796/41242 [41:32<1:29:13,  5.31it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12797/41242 [41:32<1:28:30,  5.36it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12797/41242 [41:32<1:28:30,  5.36it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12798/41242 [41:32<1:28:47,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12798/41242 [41:33<1:28:47,  5.34it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12799/41242 [41:33<1:30:20,  5.25it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12799/41242 [41:33<1:30:20,  5.25it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12800/41242 [41:33<1:31:55,  5.16it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12800/41242 [41:33<1:31:55,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12801/41242 [41:33<1:32:02,  5.15it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12801/41242 [41:33<1:32:02,  5.15it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12802/41242 [41:33<1:31:36,  5.17it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12802/41242 [41:33<1:31:36,  5.17it/s, training_loss=0.039]
Epoch 1:  31%|███       | 12803/41242 [41:33<1:31:00,  5.21it/s, training_loss=0.039]
Epoch 1:  31%|███       | 12803/41242 [41:34<1:31:00,  5.21it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12804/41242 [41:34<1:31:05,  5.20it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12804/41242 [41:34<1:31:05,  5.20it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12805/41242 [41:34<1:32:29,  5.12it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12805/41242 [41:34<1:32:29,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12806/41242 [41:34<1:31:10,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12806/41242 [41:34<1:31:10,  5.20it/s, training_loss=0.941]
Epoch 1:  31%|███       | 12807/41242 [41:34<1:30:19,  5.25it/s, training_loss=0.941]
Epoch 1:  31%|███       | 12807/41242 [41:34<1:30:19,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12808/41242 [41:34<1:30:18,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12808/41242 [41:35<1:30:18,  5.25it/s, training_loss=0.216]
Epoch 1:  31%|███       | 12809/41242 [41:35<1:31:36,  5.17it/s, training_loss=0.216]
Epoch 1:  31%|███       | 12809/41242 [41:35<1:31:36,  5.17it/s, training_loss=0.092]
Epoch 1:  31%|███       | 12810/41242 [41:35<1:32:19,  5.13it/s, training_loss=0.092]
Epoch 1:  31%|███       | 12810/41242 [41:35<1:32:19,  5.13it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12811/41242 [41:35<1:31:44,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12811/41242 [41:35<1:31:44,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12812/41242 [41:35<1:30:39,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12812/41242 [41:35<1:30:39,  5.23it/s, training_loss=1.135]
Epoch 1:  31%|███       | 12813/41242 [41:35<1:30:02,  5.26it/s, training_loss=1.135]
Epoch 1:  31%|███       | 12813/41242 [41:36<1:30:02,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12814/41242 [41:36<1:30:33,  5.23it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12814/41242 [41:36<1:30:33,  5.23it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12815/41242 [41:36<1:30:44,  5.22it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12815/41242 [41:36<1:30:44,  5.22it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12816/41242 [41:36<1:30:23,  5.24it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12816/41242 [41:36<1:30:23,  5.24it/s, training_loss=0.314]
Epoch 1:  31%|███       | 12817/41242 [41:36<1:30:36,  5.23it/s, training_loss=0.314]
Epoch 1:  31%|███       | 12817/41242 [41:36<1:30:36,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12818/41242 [41:36<1:30:38,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12818/41242 [41:37<1:30:38,  5.23it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12819/41242 [41:37<1:31:03,  5.20it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12819/41242 [41:37<1:31:03,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12820/41242 [41:37<1:32:29,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12820/41242 [41:37<1:32:29,  5.12it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12821/41242 [41:37<1:32:58,  5.09it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12821/41242 [41:37<1:32:58,  5.09it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12822/41242 [41:37<1:32:26,  5.12it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12822/41242 [41:37<1:32:26,  5.12it/s, training_loss=0.021]
Epoch 1:  31%|███       | 12823/41242 [41:37<1:32:10,  5.14it/s, training_loss=0.021]
Epoch 1:  31%|███       | 12823/41242 [41:38<1:32:10,  5.14it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12824/41242 [41:38<1:32:36,  5.11it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12824/41242 [41:38<1:32:36,  5.11it/s, training_loss=0.082]
Epoch 1:  31%|███       | 12825/41242 [41:38<1:32:57,  5.10it/s, training_loss=0.082]
Epoch 1:  31%|███       | 12825/41242 [41:38<1:32:57,  5.10it/s, training_loss=0.063]
Epoch 1:  31%|███       | 12826/41242 [41:38<1:33:44,  5.05it/s, training_loss=0.063]
Epoch 1:  31%|███       | 12826/41242 [41:38<1:33:44,  5.05it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12827/41242 [41:38<1:32:09,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12827/41242 [41:38<1:32:09,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12828/41242 [41:38<1:31:15,  5.19it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12828/41242 [41:38<1:31:15,  5.19it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12829/41242 [41:38<1:31:17,  5.19it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12829/41242 [41:39<1:31:17,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12830/41242 [41:39<1:31:02,  5.20it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12830/41242 [41:39<1:31:02,  5.20it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12831/41242 [41:39<1:30:05,  5.26it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12831/41242 [41:39<1:30:05,  5.26it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12832/41242 [41:39<1:30:37,  5.22it/s, training_loss=0.005]
Epoch 1:  31%|███       | 12832/41242 [41:39<1:30:37,  5.22it/s, training_loss=0.405]
Epoch 1:  31%|███       | 12833/41242 [41:39<1:31:45,  5.16it/s, training_loss=0.405]
Epoch 1:  31%|███       | 12833/41242 [41:39<1:31:45,  5.16it/s, training_loss=0.733]
Epoch 1:  31%|███       | 12834/41242 [41:39<1:31:33,  5.17it/s, training_loss=0.733]
Epoch 1:  31%|███       | 12834/41242 [41:40<1:31:33,  5.17it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12835/41242 [41:40<1:30:49,  5.21it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12835/41242 [41:40<1:30:49,  5.21it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12836/41242 [41:40<1:29:50,  5.27it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12836/41242 [41:40<1:29:50,  5.27it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12837/41242 [41:40<1:30:24,  5.24it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12837/41242 [41:40<1:30:24,  5.24it/s, training_loss=0.823]
Epoch 1:  31%|███       | 12838/41242 [41:40<1:31:15,  5.19it/s, training_loss=0.823]
Epoch 1:  31%|███       | 12838/41242 [41:40<1:31:15,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12839/41242 [41:40<1:31:12,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12839/41242 [41:41<1:31:12,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12840/41242 [41:41<1:31:25,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12840/41242 [41:41<1:31:25,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12841/41242 [41:41<1:32:11,  5.13it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12841/41242 [41:41<1:32:11,  5.13it/s, training_loss=0.022]
Epoch 1:  31%|███       | 12842/41242 [41:41<1:32:04,  5.14it/s, training_loss=0.022]
Epoch 1:  31%|███       | 12842/41242 [41:41<1:32:04,  5.14it/s, training_loss=0.222]
Epoch 1:  31%|███       | 12843/41242 [41:41<1:33:04,  5.09it/s, training_loss=0.222]
Epoch 1:  31%|███       | 12843/41242 [41:41<1:33:04,  5.09it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12844/41242 [41:41<1:34:04,  5.03it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12844/41242 [41:42<1:34:04,  5.03it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12845/41242 [41:42<1:34:14,  5.02it/s, training_loss=0.013]
Epoch 1:  31%|███       | 12845/41242 [41:42<1:34:14,  5.02it/s, training_loss=0.049]
Epoch 1:  31%|███       | 12846/41242 [41:42<1:34:15,  5.02it/s, training_loss=0.049]
Epoch 1:  31%|███       | 12846/41242 [41:42<1:34:15,  5.02it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12847/41242 [41:42<1:33:27,  5.06it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12847/41242 [41:42<1:33:27,  5.06it/s, training_loss=0.029]
Epoch 1:  31%|███       | 12848/41242 [41:42<1:34:13,  5.02it/s, training_loss=0.029]
Epoch 1:  31%|███       | 12848/41242 [41:42<1:34:13,  5.02it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12849/41242 [41:42<1:33:00,  5.09it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12849/41242 [41:43<1:33:00,  5.09it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12850/41242 [41:43<1:32:06,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12850/41242 [41:43<1:32:06,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12851/41242 [41:43<1:31:39,  5.16it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12851/41242 [41:43<1:31:39,  5.16it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12852/41242 [41:43<1:33:46,  5.05it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12852/41242 [41:43<1:33:46,  5.05it/s, training_loss=0.087]
Epoch 1:  31%|███       | 12853/41242 [41:43<1:33:46,  5.05it/s, training_loss=0.087]
Epoch 1:  31%|███       | 12853/41242 [41:43<1:33:46,  5.05it/s, training_loss=0.510]
Epoch 1:  31%|███       | 12854/41242 [41:43<1:33:14,  5.07it/s, training_loss=0.510]
Epoch 1:  31%|███       | 12854/41242 [41:44<1:33:14,  5.07it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12855/41242 [41:44<1:31:19,  5.18it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12855/41242 [41:44<1:31:19,  5.18it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12856/41242 [41:44<1:32:07,  5.14it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12856/41242 [41:44<1:32:07,  5.14it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12857/41242 [41:44<1:31:09,  5.19it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12857/41242 [41:44<1:31:09,  5.19it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12858/41242 [41:44<1:29:59,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12858/41242 [41:44<1:29:59,  5.26it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12859/41242 [41:44<1:28:36,  5.34it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12859/41242 [41:44<1:28:36,  5.34it/s, training_loss=0.563]
Epoch 1:  31%|███       | 12860/41242 [41:44<1:28:33,  5.34it/s, training_loss=0.563]
Epoch 1:  31%|███       | 12860/41242 [41:45<1:28:33,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12861/41242 [41:45<1:27:55,  5.38it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12861/41242 [41:45<1:27:55,  5.38it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12862/41242 [41:45<1:28:35,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12862/41242 [41:45<1:28:35,  5.34it/s, training_loss=0.067]
Epoch 1:  31%|███       | 12863/41242 [41:45<1:28:58,  5.32it/s, training_loss=0.067]
Epoch 1:  31%|███       | 12863/41242 [41:45<1:28:58,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12864/41242 [41:45<1:28:18,  5.36it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12864/41242 [41:45<1:28:18,  5.36it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12865/41242 [41:45<1:27:58,  5.38it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12865/41242 [41:46<1:27:58,  5.38it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12866/41242 [41:46<1:28:00,  5.37it/s, training_loss=0.019]
Epoch 1:  31%|███       | 12866/41242 [41:46<1:28:00,  5.37it/s, training_loss=0.022]
Epoch 1:  31%|███       | 12867/41242 [41:46<1:30:04,  5.25it/s, training_loss=0.022]
Epoch 1:  31%|███       | 12867/41242 [41:46<1:30:04,  5.25it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12868/41242 [41:46<1:31:03,  5.19it/s, training_loss=0.011]
Epoch 1:  31%|███       | 12868/41242 [41:46<1:31:03,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12869/41242 [41:46<1:29:59,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12869/41242 [41:46<1:29:59,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12870/41242 [41:46<1:29:38,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12870/41242 [41:47<1:29:38,  5.28it/s, training_loss=0.012]
Epoch 1:  31%|███       | 12871/41242 [41:47<1:30:32,  5.22it/s, training_loss=0.012]
Epoch 1:  31%|███       | 12871/41242 [41:47<1:30:32,  5.22it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12872/41242 [41:47<1:29:47,  5.27it/s, training_loss=0.007]
Epoch 1:  31%|███       | 12872/41242 [41:47<1:29:47,  5.27it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12873/41242 [41:47<1:29:37,  5.28it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12873/41242 [41:47<1:29:37,  5.28it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12874/41242 [41:47<1:29:22,  5.29it/s, training_loss=0.009]
Epoch 1:  31%|███       | 12874/41242 [41:47<1:29:22,  5.29it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12875/41242 [41:47<1:29:22,  5.29it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12875/41242 [41:48<1:29:22,  5.29it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12876/41242 [41:48<1:29:09,  5.30it/s, training_loss=0.014]
Epoch 1:  31%|███       | 12876/41242 [41:48<1:29:09,  5.30it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12877/41242 [41:48<1:29:06,  5.31it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12877/41242 [41:48<1:29:06,  5.31it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12878/41242 [41:48<1:29:13,  5.30it/s, training_loss=0.004]
Epoch 1:  31%|███       | 12878/41242 [41:48<1:29:13,  5.30it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12879/41242 [41:48<1:29:01,  5.31it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12879/41242 [41:48<1:29:01,  5.31it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12880/41242 [41:48<1:29:21,  5.29it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12880/41242 [41:48<1:29:21,  5.29it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12881/41242 [41:48<1:29:09,  5.30it/s, training_loss=0.006]
Epoch 1:  31%|███       | 12881/41242 [41:49<1:29:09,  5.30it/s, training_loss=0.867]
Epoch 1:  31%|███       | 12882/41242 [41:49<1:29:09,  5.30it/s, training_loss=0.867]
Epoch 1:  31%|███       | 12882/41242 [41:49<1:29:09,  5.30it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12883/41242 [41:49<1:28:42,  5.33it/s, training_loss=0.002]
Epoch 1:  31%|███       | 12883/41242 [41:49<1:28:42,  5.33it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12884/41242 [41:49<1:28:15,  5.36it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12884/41242 [41:49<1:28:15,  5.36it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12885/41242 [41:49<1:28:39,  5.33it/s, training_loss=0.028]
Epoch 1:  31%|███       | 12885/41242 [41:49<1:28:39,  5.33it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12886/41242 [41:49<1:28:18,  5.35it/s, training_loss=0.001]
Epoch 1:  31%|███       | 12886/41242 [41:50<1:28:18,  5.35it/s, training_loss=0.103]
Epoch 1:  31%|███       | 12887/41242 [41:50<1:28:35,  5.33it/s, training_loss=0.103]
Epoch 1:  31%|███       | 12887/41242 [41:50<1:28:35,  5.33it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12888/41242 [41:50<1:29:33,  5.28it/s, training_loss=0.003]
Epoch 1:  31%|███       | 12888/41242 [41:50<1:29:33,  5.28it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12889/41242 [41:50<1:32:17,  5.12it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12889/41242 [41:50<1:32:17,  5.12it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12890/41242 [41:50<1:30:53,  5.20it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12890/41242 [41:50<1:30:53,  5.20it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12891/41242 [41:50<1:31:02,  5.19it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12891/41242 [41:51<1:31:02,  5.19it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12892/41242 [41:51<1:30:35,  5.22it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12892/41242 [41:51<1:30:35,  5.22it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12893/41242 [41:51<1:30:04,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12893/41242 [41:51<1:30:04,  5.25it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12894/41242 [41:51<1:31:16,  5.18it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12894/41242 [41:51<1:31:16,  5.18it/s, training_loss=0.009]
Epoch 1:  31%|███▏      | 12895/41242 [41:51<1:30:15,  5.23it/s, training_loss=0.009]
Epoch 1:  31%|███▏      | 12895/41242 [41:51<1:30:15,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12896/41242 [41:51<1:31:15,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12896/41242 [41:52<1:31:15,  5.18it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12897/41242 [41:52<1:29:37,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12897/41242 [41:52<1:29:37,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12898/41242 [41:52<1:28:51,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12898/41242 [41:52<1:28:51,  5.32it/s, training_loss=0.027]
Epoch 1:  31%|███▏      | 12899/41242 [41:52<1:28:11,  5.36it/s, training_loss=0.027]
Epoch 1:  31%|███▏      | 12899/41242 [41:52<1:28:11,  5.36it/s, training_loss=0.297]
Epoch 1:  31%|███▏      | 12900/41242 [41:52<1:28:27,  5.34it/s, training_loss=0.297]
Epoch 1:  31%|███▏      | 12900/41242 [41:52<1:28:27,  5.34it/s, training_loss=0.283]
Epoch 1:  31%|███▏      | 12901/41242 [41:52<1:28:41,  5.33it/s, training_loss=0.283]
Epoch 1:  31%|███▏      | 12901/41242 [41:52<1:28:41,  5.33it/s, training_loss=0.012]
Epoch 1:  31%|███▏      | 12902/41242 [41:52<1:30:31,  5.22it/s, training_loss=0.012]
Epoch 1:  31%|███▏      | 12902/41242 [41:53<1:30:31,  5.22it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12903/41242 [41:53<1:30:17,  5.23it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12903/41242 [41:53<1:30:17,  5.23it/s, training_loss=0.290]
Epoch 1:  31%|███▏      | 12904/41242 [41:53<1:31:20,  5.17it/s, training_loss=0.290]
Epoch 1:  31%|███▏      | 12904/41242 [41:53<1:31:20,  5.17it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12905/41242 [41:53<1:30:56,  5.19it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12905/41242 [41:53<1:30:56,  5.19it/s, training_loss=0.011]
Epoch 1:  31%|███▏      | 12906/41242 [41:53<1:31:30,  5.16it/s, training_loss=0.011]
Epoch 1:  31%|███▏      | 12906/41242 [41:53<1:31:30,  5.16it/s, training_loss=0.079]
Epoch 1:  31%|███▏      | 12907/41242 [41:53<1:32:07,  5.13it/s, training_loss=0.079]
Epoch 1:  31%|███▏      | 12907/41242 [41:54<1:32:07,  5.13it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12908/41242 [41:54<1:32:19,  5.12it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12908/41242 [41:54<1:32:19,  5.12it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12909/41242 [41:54<1:31:51,  5.14it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12909/41242 [41:54<1:31:51,  5.14it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12910/41242 [41:54<1:30:50,  5.20it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12910/41242 [41:54<1:30:50,  5.20it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12911/41242 [41:54<1:30:54,  5.19it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12911/41242 [41:54<1:30:54,  5.19it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12912/41242 [41:54<1:30:19,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12912/41242 [41:55<1:30:19,  5.23it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12913/41242 [41:55<1:29:42,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12913/41242 [41:55<1:29:42,  5.26it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12914/41242 [41:55<1:29:33,  5.27it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12914/41242 [41:55<1:29:33,  5.27it/s, training_loss=0.082]
Epoch 1:  31%|███▏      | 12915/41242 [41:55<1:28:58,  5.31it/s, training_loss=0.082]
Epoch 1:  31%|███▏      | 12915/41242 [41:55<1:28:58,  5.31it/s, training_loss=0.071]
Epoch 1:  31%|███▏      | 12916/41242 [41:55<1:28:56,  5.31it/s, training_loss=0.071]
Epoch 1:  31%|███▏      | 12916/41242 [41:55<1:28:56,  5.31it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12917/41242 [41:55<1:29:49,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12917/41242 [41:56<1:29:49,  5.26it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12918/41242 [41:56<1:29:49,  5.26it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12918/41242 [41:56<1:29:49,  5.26it/s, training_loss=0.021]
Epoch 1:  31%|███▏      | 12919/41242 [41:56<1:30:44,  5.20it/s, training_loss=0.021]
Epoch 1:  31%|███▏      | 12919/41242 [41:56<1:30:44,  5.20it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12920/41242 [41:56<1:30:20,  5.23it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12920/41242 [41:56<1:30:20,  5.23it/s, training_loss=0.031]
Epoch 1:  31%|███▏      | 12921/41242 [41:56<1:30:21,  5.22it/s, training_loss=0.031]
Epoch 1:  31%|███▏      | 12921/41242 [41:56<1:30:21,  5.22it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12922/41242 [41:56<1:29:29,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12922/41242 [41:56<1:29:29,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12923/41242 [41:56<1:28:31,  5.33it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12923/41242 [41:57<1:28:31,  5.33it/s, training_loss=0.257]
Epoch 1:  31%|███▏      | 12924/41242 [41:57<1:29:38,  5.26it/s, training_loss=0.257]
Epoch 1:  31%|███▏      | 12924/41242 [41:57<1:29:38,  5.26it/s, training_loss=0.184]
Epoch 1:  31%|███▏      | 12925/41242 [41:57<1:30:26,  5.22it/s, training_loss=0.184]
Epoch 1:  31%|███▏      | 12925/41242 [41:57<1:30:26,  5.22it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12926/41242 [41:57<1:30:40,  5.20it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12926/41242 [41:57<1:30:40,  5.20it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12927/41242 [41:57<1:29:11,  5.29it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12927/41242 [41:57<1:29:11,  5.29it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12928/41242 [41:57<1:30:05,  5.24it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12928/41242 [41:58<1:30:05,  5.24it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12929/41242 [41:58<1:29:55,  5.25it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12929/41242 [41:58<1:29:55,  5.25it/s, training_loss=0.017]
Epoch 1:  31%|███▏      | 12930/41242 [41:58<1:30:29,  5.21it/s, training_loss=0.017]
Epoch 1:  31%|███▏      | 12930/41242 [41:58<1:30:29,  5.21it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12931/41242 [41:58<1:30:32,  5.21it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12931/41242 [41:58<1:30:32,  5.21it/s, training_loss=0.047]
Epoch 1:  31%|███▏      | 12932/41242 [41:58<1:30:41,  5.20it/s, training_loss=0.047]
Epoch 1:  31%|███▏      | 12932/41242 [41:58<1:30:41,  5.20it/s, training_loss=0.163]
Epoch 1:  31%|███▏      | 12933/41242 [41:58<1:31:52,  5.14it/s, training_loss=0.163]
Epoch 1:  31%|███▏      | 12933/41242 [41:59<1:31:52,  5.14it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12934/41242 [41:59<1:31:19,  5.17it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12934/41242 [41:59<1:31:19,  5.17it/s, training_loss=0.006]
Epoch 1:  31%|███▏      | 12935/41242 [41:59<1:30:33,  5.21it/s, training_loss=0.006]
Epoch 1:  31%|███▏      | 12935/41242 [41:59<1:30:33,  5.21it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12936/41242 [41:59<1:29:28,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12936/41242 [41:59<1:29:28,  5.27it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12937/41242 [41:59<1:28:37,  5.32it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12937/41242 [41:59<1:28:37,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12938/41242 [41:59<1:28:37,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12938/41242 [42:00<1:28:37,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12939/41242 [42:00<1:28:06,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12939/41242 [42:00<1:28:06,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12940/41242 [42:00<1:28:24,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12940/41242 [42:00<1:28:24,  5.34it/s, training_loss=0.012]
Epoch 1:  31%|███▏      | 12941/41242 [42:00<1:29:43,  5.26it/s, training_loss=0.012]
Epoch 1:  31%|███▏      | 12941/41242 [42:00<1:29:43,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12942/41242 [42:00<1:29:29,  5.27it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12942/41242 [42:00<1:29:29,  5.27it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12943/41242 [42:00<1:28:36,  5.32it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12943/41242 [42:00<1:28:36,  5.32it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12944/41242 [42:00<1:28:43,  5.32it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12944/41242 [42:01<1:28:43,  5.32it/s, training_loss=0.008]
Epoch 1:  31%|███▏      | 12945/41242 [42:01<1:28:05,  5.35it/s, training_loss=0.008]
Epoch 1:  31%|███▏      | 12945/41242 [42:01<1:28:05,  5.35it/s, training_loss=0.304]
Epoch 1:  31%|███▏      | 12946/41242 [42:01<1:28:31,  5.33it/s, training_loss=0.304]
Epoch 1:  31%|███▏      | 12946/41242 [42:01<1:28:31,  5.33it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12947/41242 [42:01<1:28:41,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12947/41242 [42:01<1:28:41,  5.32it/s, training_loss=0.089]
Epoch 1:  31%|███▏      | 12948/41242 [42:01<1:30:03,  5.24it/s, training_loss=0.089]
Epoch 1:  31%|███▏      | 12948/41242 [42:01<1:30:03,  5.24it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12949/41242 [42:01<1:29:37,  5.26it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12949/41242 [42:02<1:29:37,  5.26it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12950/41242 [42:02<1:28:48,  5.31it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12950/41242 [42:02<1:28:48,  5.31it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12951/41242 [42:02<1:27:33,  5.39it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12951/41242 [42:02<1:27:33,  5.39it/s, training_loss=0.329]
Epoch 1:  31%|███▏      | 12952/41242 [42:02<1:28:05,  5.35it/s, training_loss=0.329]
Epoch 1:  31%|███▏      | 12952/41242 [42:02<1:28:05,  5.35it/s, training_loss=0.632]
Epoch 1:  31%|███▏      | 12953/41242 [42:02<1:28:16,  5.34it/s, training_loss=0.632]
Epoch 1:  31%|███▏      | 12953/41242 [42:02<1:28:16,  5.34it/s, training_loss=1.108]
Epoch 1:  31%|███▏      | 12954/41242 [42:02<1:28:37,  5.32it/s, training_loss=1.108]
Epoch 1:  31%|███▏      | 12954/41242 [42:03<1:28:37,  5.32it/s, training_loss=0.027]
Epoch 1:  31%|███▏      | 12955/41242 [42:03<1:28:25,  5.33it/s, training_loss=0.027]
Epoch 1:  31%|███▏      | 12955/41242 [42:03<1:28:25,  5.33it/s, training_loss=0.054]
Epoch 1:  31%|███▏      | 12956/41242 [42:03<1:28:58,  5.30it/s, training_loss=0.054]
Epoch 1:  31%|███▏      | 12956/41242 [42:03<1:28:58,  5.30it/s, training_loss=0.024]
Epoch 1:  31%|███▏      | 12957/41242 [42:03<1:30:47,  5.19it/s, training_loss=0.024]
Epoch 1:  31%|███▏      | 12957/41242 [42:03<1:30:47,  5.19it/s, training_loss=0.031]
Epoch 1:  31%|███▏      | 12958/41242 [42:03<1:31:32,  5.15it/s, training_loss=0.031]
Epoch 1:  31%|███▏      | 12958/41242 [42:03<1:31:32,  5.15it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12959/41242 [42:03<1:31:13,  5.17it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12959/41242 [42:03<1:31:13,  5.17it/s, training_loss=0.074]
Epoch 1:  31%|███▏      | 12960/41242 [42:04<1:30:59,  5.18it/s, training_loss=0.074]
Epoch 1:  31%|███▏      | 12960/41242 [42:04<1:30:59,  5.18it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12961/41242 [42:04<1:29:33,  5.26it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12961/41242 [42:04<1:29:33,  5.26it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12962/41242 [42:04<1:28:53,  5.30it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12962/41242 [42:04<1:28:53,  5.30it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12963/41242 [42:04<1:28:17,  5.34it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12963/41242 [42:04<1:28:17,  5.34it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12964/41242 [42:04<1:27:58,  5.36it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12964/41242 [42:04<1:27:58,  5.36it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12965/41242 [42:04<1:28:17,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12965/41242 [42:05<1:28:17,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12966/41242 [42:05<1:29:15,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12966/41242 [42:05<1:29:15,  5.28it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12967/41242 [42:05<1:29:43,  5.25it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12967/41242 [42:05<1:29:43,  5.25it/s, training_loss=0.532]
Epoch 1:  31%|███▏      | 12968/41242 [42:05<1:30:13,  5.22it/s, training_loss=0.532]
Epoch 1:  31%|███▏      | 12968/41242 [42:05<1:30:13,  5.22it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12969/41242 [42:05<1:30:45,  5.19it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12969/41242 [42:05<1:30:45,  5.19it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12970/41242 [42:05<1:30:01,  5.23it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12970/41242 [42:06<1:30:01,  5.23it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12971/41242 [42:06<1:29:18,  5.28it/s, training_loss=0.001]
Epoch 1:  31%|███▏      | 12971/41242 [42:06<1:29:18,  5.28it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12972/41242 [42:06<1:29:02,  5.29it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12972/41242 [42:06<1:29:02,  5.29it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12973/41242 [42:06<1:28:47,  5.31it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12973/41242 [42:06<1:28:47,  5.31it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12974/41242 [42:06<1:28:32,  5.32it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12974/41242 [42:06<1:28:32,  5.32it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12975/41242 [42:06<1:27:59,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12975/41242 [42:07<1:27:59,  5.35it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12976/41242 [42:07<1:27:58,  5.35it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12976/41242 [42:07<1:27:58,  5.35it/s, training_loss=0.044]
Epoch 1:  31%|███▏      | 12977/41242 [42:07<1:28:19,  5.33it/s, training_loss=0.044]
Epoch 1:  31%|███▏      | 12977/41242 [42:07<1:28:19,  5.33it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12978/41242 [42:07<1:27:45,  5.37it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12978/41242 [42:07<1:27:45,  5.37it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12979/41242 [42:07<1:28:02,  5.35it/s, training_loss=0.003]
Epoch 1:  31%|███▏      | 12979/41242 [42:07<1:28:02,  5.35it/s, training_loss=0.010]
Epoch 1:  31%|███▏      | 12980/41242 [42:07<1:28:26,  5.33it/s, training_loss=0.010]
Epoch 1:  31%|███▏      | 12980/41242 [42:07<1:28:26,  5.33it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12981/41242 [42:07<1:27:58,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12981/41242 [42:08<1:27:58,  5.35it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12982/41242 [42:08<1:27:35,  5.38it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12982/41242 [42:08<1:27:35,  5.38it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12983/41242 [42:08<1:27:44,  5.37it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12983/41242 [42:08<1:27:44,  5.37it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12984/41242 [42:08<1:26:48,  5.43it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12984/41242 [42:08<1:26:48,  5.43it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12985/41242 [42:08<1:27:41,  5.37it/s, training_loss=0.004]
Epoch 1:  31%|███▏      | 12985/41242 [42:08<1:27:41,  5.37it/s, training_loss=0.307]
Epoch 1:  31%|███▏      | 12986/41242 [42:08<1:28:07,  5.34it/s, training_loss=0.307]
Epoch 1:  31%|███▏      | 12986/41242 [42:09<1:28:07,  5.34it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12987/41242 [42:09<1:28:20,  5.33it/s, training_loss=0.002]
Epoch 1:  31%|███▏      | 12987/41242 [42:09<1:28:20,  5.33it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12988/41242 [42:09<1:28:28,  5.32it/s, training_loss=0.007]
Epoch 1:  31%|███▏      | 12988/41242 [42:09<1:28:28,  5.32it/s, training_loss=0.021]
Epoch 1:  31%|███▏      | 12989/41242 [42:09<1:28:34,  5.32it/s, training_loss=0.021]
Epoch 1:  31%|███▏      | 12989/41242 [42:09<1:28:34,  5.32it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12990/41242 [42:09<1:28:57,  5.29it/s, training_loss=0.005]
Epoch 1:  31%|███▏      | 12990/41242 [42:09<1:28:57,  5.29it/s, training_loss=0.067]
Epoch 1:  31%|███▏      | 12991/41242 [42:09<1:30:18,  5.21it/s, training_loss=0.067]
Epoch 1:  31%|███▏      | 12991/41242 [42:10<1:30:18,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 12992/41242 [42:10<1:31:24,  5.15it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 12992/41242 [42:10<1:31:24,  5.15it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 12993/41242 [42:10<1:31:28,  5.15it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 12993/41242 [42:10<1:31:28,  5.15it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 12994/41242 [42:10<1:32:01,  5.12it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 12994/41242 [42:10<1:32:01,  5.12it/s, training_loss=0.541]
Epoch 1:  32%|███▏      | 12995/41242 [42:10<1:31:59,  5.12it/s, training_loss=0.541]
Epoch 1:  32%|███▏      | 12995/41242 [42:10<1:31:59,  5.12it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 12996/41242 [42:10<1:31:13,  5.16it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 12996/41242 [42:10<1:31:13,  5.16it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 12997/41242 [42:10<1:30:05,  5.22it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 12997/41242 [42:11<1:30:05,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 12998/41242 [42:11<1:28:33,  5.32it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 12998/41242 [42:11<1:28:33,  5.32it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 12999/41242 [42:11<1:27:44,  5.37it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 12999/41242 [42:11<1:27:44,  5.37it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13000/41242 [42:11<1:28:08,  5.34it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13000/41242 [42:11<1:28:08,  5.34it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13001/41242 [42:11<1:30:28,  5.20it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13001/41242 [42:11<1:30:28,  5.20it/s, training_loss=0.180]
Epoch 1:  32%|███▏      | 13002/41242 [42:11<1:30:33,  5.20it/s, training_loss=0.180]
Epoch 1:  32%|███▏      | 13002/41242 [42:12<1:30:33,  5.20it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13003/41242 [42:12<1:29:57,  5.23it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13003/41242 [42:12<1:29:57,  5.23it/s, training_loss=0.474]
Epoch 1:  32%|███▏      | 13004/41242 [42:12<1:29:46,  5.24it/s, training_loss=0.474]
Epoch 1:  32%|███▏      | 13004/41242 [42:12<1:29:46,  5.24it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13005/41242 [42:12<1:29:04,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13005/41242 [42:12<1:29:04,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13006/41242 [42:12<1:29:20,  5.27it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13006/41242 [42:12<1:29:20,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13007/41242 [42:12<1:28:42,  5.30it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13007/41242 [42:13<1:28:42,  5.30it/s, training_loss=0.184]
Epoch 1:  32%|███▏      | 13008/41242 [42:13<1:28:17,  5.33it/s, training_loss=0.184]
Epoch 1:  32%|███▏      | 13008/41242 [42:13<1:28:17,  5.33it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13009/41242 [42:13<1:27:58,  5.35it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13009/41242 [42:13<1:27:58,  5.35it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13010/41242 [42:13<1:27:47,  5.36it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13010/41242 [42:13<1:27:47,  5.36it/s, training_loss=0.044]
Epoch 1:  32%|███▏      | 13011/41242 [42:13<1:28:57,  5.29it/s, training_loss=0.044]
Epoch 1:  32%|███▏      | 13011/41242 [42:13<1:28:57,  5.29it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13012/41242 [42:13<1:29:41,  5.25it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13012/41242 [42:14<1:29:41,  5.25it/s, training_loss=0.022]
Epoch 1:  32%|███▏      | 13013/41242 [42:14<1:31:14,  5.16it/s, training_loss=0.022]
Epoch 1:  32%|███▏      | 13013/41242 [42:14<1:31:14,  5.16it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13014/41242 [42:14<1:30:10,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13014/41242 [42:14<1:30:10,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13015/41242 [42:14<1:29:16,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13015/41242 [42:14<1:29:16,  5.27it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13016/41242 [42:14<1:29:03,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13016/41242 [42:14<1:29:03,  5.28it/s, training_loss=0.348]
Epoch 1:  32%|███▏      | 13017/41242 [42:14<1:30:34,  5.19it/s, training_loss=0.348]
Epoch 1:  32%|███▏      | 13017/41242 [42:14<1:30:34,  5.19it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13018/41242 [42:14<1:30:21,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13018/41242 [42:15<1:30:21,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13019/41242 [42:15<1:29:59,  5.23it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13019/41242 [42:15<1:29:59,  5.23it/s, training_loss=0.996]
Epoch 1:  32%|███▏      | 13020/41242 [42:15<1:31:10,  5.16it/s, training_loss=0.996]
Epoch 1:  32%|███▏      | 13020/41242 [42:15<1:31:10,  5.16it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13021/41242 [42:15<1:30:25,  5.20it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13021/41242 [42:15<1:30:25,  5.20it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13022/41242 [42:15<1:30:30,  5.20it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13022/41242 [42:15<1:30:30,  5.20it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13023/41242 [42:15<1:29:37,  5.25it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13023/41242 [42:16<1:29:37,  5.25it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13024/41242 [42:16<1:28:23,  5.32it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13024/41242 [42:16<1:28:23,  5.32it/s, training_loss=0.179]
Epoch 1:  32%|███▏      | 13025/41242 [42:16<1:29:37,  5.25it/s, training_loss=0.179]
Epoch 1:  32%|███▏      | 13025/41242 [42:16<1:29:37,  5.25it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13026/41242 [42:16<1:29:07,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13026/41242 [42:16<1:29:07,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13027/41242 [42:16<1:28:24,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13027/41242 [42:16<1:28:24,  5.32it/s, training_loss=0.010]
Epoch 1:  32%|███▏      | 13028/41242 [42:16<1:29:56,  5.23it/s, training_loss=0.010]
Epoch 1:  32%|███▏      | 13028/41242 [42:17<1:29:56,  5.23it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13029/41242 [42:17<1:30:46,  5.18it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13029/41242 [42:17<1:30:46,  5.18it/s, training_loss=0.053]
Epoch 1:  32%|███▏      | 13030/41242 [42:17<1:31:05,  5.16it/s, training_loss=0.053]
Epoch 1:  32%|███▏      | 13030/41242 [42:17<1:31:05,  5.16it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13031/41242 [42:17<1:29:33,  5.25it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13031/41242 [42:17<1:29:33,  5.25it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13032/41242 [42:17<1:31:01,  5.17it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13032/41242 [42:17<1:31:01,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13033/41242 [42:17<1:31:21,  5.15it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13033/41242 [42:18<1:31:21,  5.15it/s, training_loss=0.239]
Epoch 1:  32%|███▏      | 13034/41242 [42:18<1:30:58,  5.17it/s, training_loss=0.239]
Epoch 1:  32%|███▏      | 13034/41242 [42:18<1:30:58,  5.17it/s, training_loss=0.744]
Epoch 1:  32%|███▏      | 13035/41242 [42:18<1:32:43,  5.07it/s, training_loss=0.744]
Epoch 1:  32%|███▏      | 13035/41242 [42:18<1:32:43,  5.07it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13036/41242 [42:18<1:31:53,  5.12it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13036/41242 [42:18<1:31:53,  5.12it/s, training_loss=0.356]
Epoch 1:  32%|███▏      | 13037/41242 [42:18<1:31:48,  5.12it/s, training_loss=0.356]
Epoch 1:  32%|███▏      | 13037/41242 [42:18<1:31:48,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13038/41242 [42:18<1:31:28,  5.14it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13038/41242 [42:19<1:31:28,  5.14it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13039/41242 [42:19<1:30:50,  5.17it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13039/41242 [42:19<1:30:50,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13040/41242 [42:19<1:30:19,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13040/41242 [42:19<1:30:19,  5.20it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13041/41242 [42:19<1:30:51,  5.17it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13041/41242 [42:19<1:30:51,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13042/41242 [42:19<1:30:54,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13042/41242 [42:19<1:30:54,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13043/41242 [42:19<1:29:54,  5.23it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13043/41242 [42:19<1:29:54,  5.23it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13044/41242 [42:19<1:30:16,  5.21it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13044/41242 [42:20<1:30:16,  5.21it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13045/41242 [42:20<1:29:34,  5.25it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13045/41242 [42:20<1:29:34,  5.25it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13046/41242 [42:20<1:29:11,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13046/41242 [42:20<1:29:11,  5.27it/s, training_loss=0.020]
Epoch 1:  32%|███▏      | 13047/41242 [42:20<1:30:29,  5.19it/s, training_loss=0.020]
Epoch 1:  32%|███▏      | 13047/41242 [42:20<1:30:29,  5.19it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13048/41242 [42:20<1:29:43,  5.24it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13048/41242 [42:20<1:29:43,  5.24it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13049/41242 [42:20<1:29:28,  5.25it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13049/41242 [42:21<1:29:28,  5.25it/s, training_loss=0.036]
Epoch 1:  32%|███▏      | 13050/41242 [42:21<1:29:16,  5.26it/s, training_loss=0.036]
Epoch 1:  32%|███▏      | 13050/41242 [42:21<1:29:16,  5.26it/s, training_loss=0.718]
Epoch 1:  32%|███▏      | 13051/41242 [42:21<1:30:44,  5.18it/s, training_loss=0.718]
Epoch 1:  32%|███▏      | 13051/41242 [42:21<1:30:44,  5.18it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13052/41242 [42:21<1:30:15,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13052/41242 [42:21<1:30:15,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13053/41242 [42:21<1:29:09,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13053/41242 [42:21<1:29:09,  5.27it/s, training_loss=0.048]
Epoch 1:  32%|███▏      | 13054/41242 [42:21<1:29:04,  5.27it/s, training_loss=0.048]
Epoch 1:  32%|███▏      | 13054/41242 [42:22<1:29:04,  5.27it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13055/41242 [42:22<1:28:32,  5.31it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13055/41242 [42:22<1:28:32,  5.31it/s, training_loss=0.453]
Epoch 1:  32%|███▏      | 13056/41242 [42:22<1:28:38,  5.30it/s, training_loss=0.453]
Epoch 1:  32%|███▏      | 13056/41242 [42:22<1:28:38,  5.30it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13057/41242 [42:22<1:28:03,  5.33it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13057/41242 [42:22<1:28:03,  5.33it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13058/41242 [42:22<1:28:56,  5.28it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13058/41242 [42:22<1:28:56,  5.28it/s, training_loss=0.759]
Epoch 1:  32%|███▏      | 13059/41242 [42:22<1:29:58,  5.22it/s, training_loss=0.759]
Epoch 1:  32%|███▏      | 13059/41242 [42:23<1:29:58,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13060/41242 [42:23<1:30:12,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13060/41242 [42:23<1:30:12,  5.21it/s, training_loss=0.095]
Epoch 1:  32%|███▏      | 13061/41242 [42:23<1:31:02,  5.16it/s, training_loss=0.095]
Epoch 1:  32%|███▏      | 13061/41242 [42:23<1:31:02,  5.16it/s, training_loss=0.026]
Epoch 1:  32%|███▏      | 13062/41242 [42:23<1:31:02,  5.16it/s, training_loss=0.026]
Epoch 1:  32%|███▏      | 13062/41242 [42:23<1:31:02,  5.16it/s, training_loss=0.090]
Epoch 1:  32%|███▏      | 13063/41242 [42:23<1:30:38,  5.18it/s, training_loss=0.090]
Epoch 1:  32%|███▏      | 13063/41242 [42:23<1:30:38,  5.18it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13064/41242 [42:23<1:29:59,  5.22it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13064/41242 [42:23<1:29:59,  5.22it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13065/41242 [42:23<1:29:35,  5.24it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13065/41242 [42:24<1:29:35,  5.24it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13066/41242 [42:24<1:28:57,  5.28it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13066/41242 [42:24<1:28:57,  5.28it/s, training_loss=0.102]
Epoch 1:  32%|███▏      | 13067/41242 [42:24<1:29:37,  5.24it/s, training_loss=0.102]
Epoch 1:  32%|███▏      | 13067/41242 [42:24<1:29:37,  5.24it/s, training_loss=0.253]
Epoch 1:  32%|███▏      | 13068/41242 [42:24<1:30:18,  5.20it/s, training_loss=0.253]
Epoch 1:  32%|███▏      | 13068/41242 [42:24<1:30:18,  5.20it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13069/41242 [42:24<1:30:33,  5.18it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13069/41242 [42:24<1:30:33,  5.18it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13070/41242 [42:24<1:31:05,  5.15it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13070/41242 [42:25<1:31:05,  5.15it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13071/41242 [42:25<1:31:35,  5.13it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13071/41242 [42:25<1:31:35,  5.13it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13072/41242 [42:25<1:33:50,  5.00it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13072/41242 [42:25<1:33:50,  5.00it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13073/41242 [42:25<1:32:52,  5.06it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13073/41242 [42:25<1:32:52,  5.06it/s, training_loss=0.517]
Epoch 1:  32%|███▏      | 13074/41242 [42:25<1:32:02,  5.10it/s, training_loss=0.517]
Epoch 1:  32%|███▏      | 13074/41242 [42:25<1:32:02,  5.10it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13075/41242 [42:25<1:30:23,  5.19it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13075/41242 [42:26<1:30:23,  5.19it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13076/41242 [42:26<1:30:01,  5.21it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13076/41242 [42:26<1:30:01,  5.21it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13077/41242 [42:26<1:30:29,  5.19it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13077/41242 [42:26<1:30:29,  5.19it/s, training_loss=0.483]
Epoch 1:  32%|███▏      | 13078/41242 [42:26<1:30:48,  5.17it/s, training_loss=0.483]
Epoch 1:  32%|███▏      | 13078/41242 [42:26<1:30:48,  5.17it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13079/41242 [42:26<1:29:27,  5.25it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13079/41242 [42:26<1:29:27,  5.25it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13080/41242 [42:26<1:29:01,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13080/41242 [42:27<1:29:01,  5.27it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13081/41242 [42:27<1:28:18,  5.32it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13081/41242 [42:27<1:28:18,  5.32it/s, training_loss=0.133]
Epoch 1:  32%|███▏      | 13082/41242 [42:27<1:28:35,  5.30it/s, training_loss=0.133]
Epoch 1:  32%|███▏      | 13082/41242 [42:27<1:28:35,  5.30it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13083/41242 [42:27<1:29:55,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13083/41242 [42:27<1:29:55,  5.22it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13084/41242 [42:27<1:29:45,  5.23it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13084/41242 [42:27<1:29:45,  5.23it/s, training_loss=0.016]
Epoch 1:  32%|███▏      | 13085/41242 [42:27<1:29:11,  5.26it/s, training_loss=0.016]
Epoch 1:  32%|███▏      | 13085/41242 [42:28<1:29:11,  5.26it/s, training_loss=0.333]
Epoch 1:  32%|███▏      | 13086/41242 [42:28<1:29:11,  5.26it/s, training_loss=0.333]
Epoch 1:  32%|███▏      | 13086/41242 [42:28<1:29:11,  5.26it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13087/41242 [42:28<1:30:00,  5.21it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13087/41242 [42:28<1:30:00,  5.21it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13088/41242 [42:28<1:33:05,  5.04it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13088/41242 [42:28<1:33:05,  5.04it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13089/41242 [42:28<1:32:47,  5.06it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13089/41242 [42:28<1:32:47,  5.06it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13090/41242 [42:28<1:31:34,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13090/41242 [42:29<1:31:34,  5.12it/s, training_loss=0.114]
Epoch 1:  32%|███▏      | 13091/41242 [42:29<1:31:29,  5.13it/s, training_loss=0.114]
Epoch 1:  32%|███▏      | 13091/41242 [42:29<1:31:29,  5.13it/s, training_loss=0.031]
Epoch 1:  32%|███▏      | 13092/41242 [42:29<1:31:56,  5.10it/s, training_loss=0.031]
Epoch 1:  32%|███▏      | 13092/41242 [42:29<1:31:56,  5.10it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13093/41242 [42:29<1:31:37,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13093/41242 [42:29<1:31:37,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13094/41242 [42:29<1:31:19,  5.14it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13094/41242 [42:29<1:31:19,  5.14it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13095/41242 [42:29<1:30:55,  5.16it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13095/41242 [42:29<1:30:55,  5.16it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13096/41242 [42:30<1:32:07,  5.09it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13096/41242 [42:30<1:32:07,  5.09it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13097/41242 [42:30<1:33:18,  5.03it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13097/41242 [42:30<1:33:18,  5.03it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13098/41242 [42:30<1:31:53,  5.10it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13098/41242 [42:30<1:31:53,  5.10it/s, training_loss=0.046]
Epoch 1:  32%|███▏      | 13099/41242 [42:30<1:30:31,  5.18it/s, training_loss=0.046]
Epoch 1:  32%|███▏      | 13099/41242 [42:30<1:30:31,  5.18it/s, training_loss=0.062]
Epoch 1:  32%|███▏      | 13100/41242 [42:30<1:29:32,  5.24it/s, training_loss=0.062]
Epoch 1:  32%|███▏      | 13100/41242 [42:30<1:29:32,  5.24it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13101/41242 [42:30<1:30:12,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13101/41242 [42:31<1:30:12,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13102/41242 [42:31<1:30:07,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13102/41242 [42:31<1:30:07,  5.20it/s, training_loss=0.012]
Epoch 1:  32%|███▏      | 13103/41242 [42:31<1:30:55,  5.16it/s, training_loss=0.012]
Epoch 1:  32%|███▏      | 13103/41242 [42:31<1:30:55,  5.16it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13104/41242 [42:31<1:32:34,  5.07it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13104/41242 [42:31<1:32:34,  5.07it/s, training_loss=0.184]
Epoch 1:  32%|███▏      | 13105/41242 [42:31<1:36:59,  4.84it/s, training_loss=0.184]
Epoch 1:  32%|███▏      | 13105/41242 [42:31<1:36:59,  4.84it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13106/41242 [42:31<1:36:04,  4.88it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13106/41242 [42:32<1:36:04,  4.88it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13107/41242 [42:32<1:34:41,  4.95it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13107/41242 [42:32<1:34:41,  4.95it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13108/41242 [42:32<1:35:16,  4.92it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13108/41242 [42:32<1:35:16,  4.92it/s, training_loss=0.253]
Epoch 1:  32%|███▏      | 13109/41242 [42:32<1:34:52,  4.94it/s, training_loss=0.253]
Epoch 1:  32%|███▏      | 13109/41242 [42:32<1:34:52,  4.94it/s, training_loss=0.365]
Epoch 1:  32%|███▏      | 13110/41242 [42:32<1:35:50,  4.89it/s, training_loss=0.365]
Epoch 1:  32%|███▏      | 13110/41242 [42:32<1:35:50,  4.89it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13111/41242 [42:32<1:33:59,  4.99it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13111/41242 [42:33<1:33:59,  4.99it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13112/41242 [42:33<1:31:52,  5.10it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13112/41242 [42:33<1:31:52,  5.10it/s, training_loss=0.237]
Epoch 1:  32%|███▏      | 13113/41242 [42:33<1:30:53,  5.16it/s, training_loss=0.237]
Epoch 1:  32%|███▏      | 13113/41242 [42:33<1:30:53,  5.16it/s, training_loss=0.364]
Epoch 1:  32%|███▏      | 13114/41242 [42:33<1:31:20,  5.13it/s, training_loss=0.364]
Epoch 1:  32%|███▏      | 13114/41242 [42:33<1:31:20,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13115/41242 [42:33<1:31:18,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13115/41242 [42:33<1:31:18,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13116/41242 [42:33<1:30:39,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13116/41242 [42:34<1:30:39,  5.17it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13117/41242 [42:34<1:30:14,  5.19it/s, training_loss=0.007]
Epoch 1:  32%|███▏      | 13117/41242 [42:34<1:30:14,  5.19it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13118/41242 [42:34<1:30:07,  5.20it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13118/41242 [42:34<1:30:07,  5.20it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13119/41242 [42:34<1:31:11,  5.14it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13119/41242 [42:34<1:31:11,  5.14it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13120/41242 [42:34<1:30:13,  5.19it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13120/41242 [42:34<1:30:13,  5.19it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13121/41242 [42:34<1:31:19,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13121/41242 [42:35<1:31:19,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13122/41242 [42:35<1:30:03,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13122/41242 [42:35<1:30:03,  5.20it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13123/41242 [42:35<1:29:01,  5.26it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13123/41242 [42:35<1:29:01,  5.26it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13124/41242 [42:35<1:29:38,  5.23it/s, training_loss=0.006]
Epoch 1:  32%|███▏      | 13124/41242 [42:35<1:29:38,  5.23it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13125/41242 [42:35<1:29:18,  5.25it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13125/41242 [42:35<1:29:18,  5.25it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13126/41242 [42:35<1:28:19,  5.31it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13126/41242 [42:36<1:28:19,  5.31it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13127/41242 [42:36<1:28:23,  5.30it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13127/41242 [42:36<1:28:23,  5.30it/s, training_loss=0.100]
Epoch 1:  32%|███▏      | 13128/41242 [42:36<1:29:41,  5.22it/s, training_loss=0.100]
Epoch 1:  32%|███▏      | 13128/41242 [42:36<1:29:41,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13129/41242 [42:36<1:28:37,  5.29it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13129/41242 [42:36<1:28:37,  5.29it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13130/41242 [42:36<1:28:17,  5.31it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13130/41242 [42:36<1:28:17,  5.31it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13131/41242 [42:36<1:29:41,  5.22it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13131/41242 [42:37<1:29:41,  5.22it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13132/41242 [42:37<1:31:39,  5.11it/s, training_loss=0.018]
Epoch 1:  32%|███▏      | 13132/41242 [42:37<1:31:39,  5.11it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13133/41242 [42:37<1:31:02,  5.15it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13133/41242 [42:37<1:31:02,  5.15it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13134/41242 [42:37<1:29:49,  5.22it/s, training_loss=0.011]
Epoch 1:  32%|███▏      | 13134/41242 [42:37<1:29:49,  5.22it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13135/41242 [42:37<1:28:38,  5.28it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13135/41242 [42:37<1:28:38,  5.28it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13136/41242 [42:37<1:28:02,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13136/41242 [42:37<1:28:02,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13137/41242 [42:37<1:28:07,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13137/41242 [42:38<1:28:07,  5.32it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13138/41242 [42:38<1:28:51,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13138/41242 [42:38<1:28:51,  5.27it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13139/41242 [42:38<1:28:06,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13139/41242 [42:38<1:28:06,  5.32it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13140/41242 [42:38<1:29:25,  5.24it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13140/41242 [42:38<1:29:25,  5.24it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13141/41242 [42:38<1:28:47,  5.27it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13141/41242 [42:38<1:28:47,  5.27it/s, training_loss=0.067]
Epoch 1:  32%|███▏      | 13142/41242 [42:38<1:29:53,  5.21it/s, training_loss=0.067]
Epoch 1:  32%|███▏      | 13142/41242 [42:39<1:29:53,  5.21it/s, training_loss=0.428]
Epoch 1:  32%|███▏      | 13143/41242 [42:39<1:31:09,  5.14it/s, training_loss=0.428]
Epoch 1:  32%|███▏      | 13143/41242 [42:39<1:31:09,  5.14it/s, training_loss=0.019]
Epoch 1:  32%|███▏      | 13144/41242 [42:39<1:32:09,  5.08it/s, training_loss=0.019]
Epoch 1:  32%|███▏      | 13144/41242 [42:39<1:32:09,  5.08it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13145/41242 [42:39<1:30:36,  5.17it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13145/41242 [42:39<1:30:36,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13146/41242 [42:39<1:29:26,  5.24it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13146/41242 [42:39<1:29:26,  5.24it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13147/41242 [42:39<1:30:57,  5.15it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13147/41242 [42:40<1:30:57,  5.15it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13148/41242 [42:40<1:31:31,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13148/41242 [42:40<1:31:31,  5.12it/s, training_loss=0.045]
Epoch 1:  32%|███▏      | 13149/41242 [42:40<1:31:17,  5.13it/s, training_loss=0.045]
Epoch 1:  32%|███▏      | 13149/41242 [42:40<1:31:17,  5.13it/s, training_loss=0.474]
Epoch 1:  32%|███▏      | 13150/41242 [42:40<1:30:39,  5.16it/s, training_loss=0.474]
Epoch 1:  32%|███▏      | 13150/41242 [42:40<1:30:39,  5.16it/s, training_loss=0.825]
Epoch 1:  32%|███▏      | 13151/41242 [42:40<1:30:26,  5.18it/s, training_loss=0.825]
Epoch 1:  32%|███▏      | 13151/41242 [42:40<1:30:26,  5.18it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13152/41242 [42:40<1:29:33,  5.23it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13152/41242 [42:41<1:29:33,  5.23it/s, training_loss=0.370]
Epoch 1:  32%|███▏      | 13153/41242 [42:41<1:29:19,  5.24it/s, training_loss=0.370]
Epoch 1:  32%|███▏      | 13153/41242 [42:41<1:29:19,  5.24it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13154/41242 [42:41<1:29:27,  5.23it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13154/41242 [42:41<1:29:27,  5.23it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13155/41242 [42:41<1:29:13,  5.25it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13155/41242 [42:41<1:29:13,  5.25it/s, training_loss=0.509]
Epoch 1:  32%|███▏      | 13156/41242 [42:41<1:30:54,  5.15it/s, training_loss=0.509]
Epoch 1:  32%|███▏      | 13156/41242 [42:41<1:30:54,  5.15it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13157/41242 [42:41<1:30:26,  5.18it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13157/41242 [42:41<1:30:26,  5.18it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13158/41242 [42:41<1:29:38,  5.22it/s, training_loss=0.001]
Epoch 1:  32%|███▏      | 13158/41242 [42:42<1:29:38,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13159/41242 [42:42<1:28:34,  5.28it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13159/41242 [42:42<1:28:34,  5.28it/s, training_loss=0.082]
Epoch 1:  32%|███▏      | 13160/41242 [42:42<1:28:26,  5.29it/s, training_loss=0.082]
Epoch 1:  32%|███▏      | 13160/41242 [42:42<1:28:26,  5.29it/s, training_loss=0.031]
Epoch 1:  32%|███▏      | 13161/41242 [42:42<1:28:37,  5.28it/s, training_loss=0.031]
Epoch 1:  32%|███▏      | 13161/41242 [42:42<1:28:37,  5.28it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13162/41242 [42:42<1:29:01,  5.26it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13162/41242 [42:42<1:29:01,  5.26it/s, training_loss=0.191]
Epoch 1:  32%|███▏      | 13163/41242 [42:42<1:29:25,  5.23it/s, training_loss=0.191]
Epoch 1:  32%|███▏      | 13163/41242 [42:43<1:29:25,  5.23it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13164/41242 [42:43<1:31:43,  5.10it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13164/41242 [42:43<1:31:43,  5.10it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13165/41242 [42:43<1:32:55,  5.04it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13165/41242 [42:43<1:32:55,  5.04it/s, training_loss=0.125]
Epoch 1:  32%|███▏      | 13166/41242 [42:43<1:33:56,  4.98it/s, training_loss=0.125]
Epoch 1:  32%|███▏      | 13166/41242 [42:43<1:33:56,  4.98it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13167/41242 [42:43<1:32:32,  5.06it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13167/41242 [42:43<1:32:32,  5.06it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13168/41242 [42:43<1:31:11,  5.13it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13168/41242 [42:44<1:31:11,  5.13it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13169/41242 [42:44<1:30:25,  5.17it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13169/41242 [42:44<1:30:25,  5.17it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13170/41242 [42:44<1:29:35,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13170/41242 [42:44<1:29:35,  5.22it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13171/41242 [42:44<1:31:26,  5.12it/s, training_loss=0.002]
Epoch 1:  32%|███▏      | 13171/41242 [42:44<1:31:26,  5.12it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13172/41242 [42:44<1:33:26,  5.01it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13172/41242 [42:44<1:33:26,  5.01it/s, training_loss=0.103]
Epoch 1:  32%|███▏      | 13173/41242 [42:44<1:33:03,  5.03it/s, training_loss=0.103]
Epoch 1:  32%|███▏      | 13173/41242 [42:45<1:33:03,  5.03it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13174/41242 [42:45<1:32:28,  5.06it/s, training_loss=0.004]
Epoch 1:  32%|███▏      | 13174/41242 [42:45<1:32:28,  5.06it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13175/41242 [42:45<1:33:12,  5.02it/s, training_loss=0.005]
Epoch 1:  32%|███▏      | 13175/41242 [42:45<1:33:12,  5.02it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13176/41242 [42:45<1:34:47,  4.93it/s, training_loss=0.009]
Epoch 1:  32%|███▏      | 13176/41242 [42:45<1:34:47,  4.93it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13177/41242 [42:45<1:32:42,  5.05it/s, training_loss=0.003]
Epoch 1:  32%|███▏      | 13177/41242 [42:45<1:32:42,  5.05it/s, training_loss=0.027]
Buffered data was truncated after reaching the output size limit.

**Test model with original data**

In [ ]:
df = incidentsData.copy()
In [ ]:
# Lets find out Groups with ticket count only 1, for such groups we cant split the data set in a stratified fashion
minor_df = df.groupby('Assignment group').filter(lambda x: len(x) <= 1)

# Treat the imbalnce in the 'other' dataset by resampling
from sklearn.utils import resample
incidentsData_upsampled = minor_df[0:0]

# Upsample minority class
for grp in minor_df['Assignment group'].unique():
    incidentsData_Group = df[df['Assignment group'] == grp]
    resampled = resample(incidentsData_Group, 
                         replace=True, # sample with replacement
                         n_samples=2, 
                         random_state=123) # reproducible results
    
    incidentsData_upsampled = incidentsData_upsampled.append(resampled)

frames = [df,incidentsData_upsampled]
df = pd.concat(frames)
In [ ]:
df, X, label_dict  = get_transormedX_and_labels(df)

x_train, x_test, y_train, y_test  = train_test_split(X.index.values,
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[x_train, 'data_type'] = 'train'
df.loc[x_test, 'data_type'] = 'val'

dataset_train, dataset_val = get_train_val_dataset(df)

batch_size = 32
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
						

In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load(project_path + 'data_volume/over_finetuned_BERT_epoch_2.model', map_location=torch.device('cpu')))

avg_val_loss, predictions, true_vals, avg_val_accuracy = evaluate(dataloader_validation)
print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
print("  Validation Loss: {0:.2f}".format(avg_val_loss))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  Accuracy: 0.90
  Validation Loss: 0.74
In [ ]:
# From above report
update_model_score('BERT with Over-Sampling for training','-',0.90,'-', '-','-')

**3. RoBERTa**

In [ ]:
import re
!pip install transformers==3.0.0
from fastai.text import *
from fastai.metrics import *
from transformers import RobertaTokenizer
In [ ]:
from sklearn.utils import resample

incidentsData_upsampled = incidentsData_Others[0:0]

# Upsample minority class
for grp in incidentsData_Others['Assignment group'].unique():
    incidentsData_Group = incidentsData_Others[incidentsData_Others['Assignment group'] == grp]
    resampled = resample(incidentsData_Group, 
                         replace=True, # sample with replacement
                         n_samples=200, 
                         random_state=123) # reproducible results
    
    incidentsData_upsampled = incidentsData_upsampled.append(resampled)

incidentsData_Others_upsample_small = pd.concat([incidentsData_Group_minor[incidentsData_Group_minor['Assignment group']=='GRP_0'],incidentsData_upsampled])
incidentsData_Others_upsample_small.reset_index(inplace=True)
In [ ]:
df=incidentsData_Others_upsample_small.copy()

df['Assignment group'] = df['Assignment group'].apply(lambda x: re.sub(r'([a-zA-Z+_])',"", x))
df['Assignment group'] = df['Assignment group'].astype(int)
df.drop(columns='Short description', inplace=True)
df.drop(columns='Description', inplace=True)
df.drop(columns='Description_pos_tagged', inplace=True)
df.drop(columns='Caller',inplace=True)
label = max(df['Assignment group']) +1 
In [ ]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', 
    max_lr=1e-5,
    epochs=5,
    use_fp16=False,
    bs=4, 
    max_seq_len=256,
    num_labels = label,
    hidden_dropout_prob=.05,
    hidden_size=768, 
    start_tok = "<s>",
    end_tok = "</s>",
)

class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        return [config.start_tok] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + [config.end_tok]
In [ ]:
feat_cols = "New_Description"
label_cols = "Assignment group"
# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])


In [ ]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))
In [ ]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), RobertaNumericalizeProcessor(vocab=vocab)]
In [ ]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList
In [ ]:
# loading the tokenizer and vocab processors
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

# creating our databunch 
data = RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=config.seed) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=0)
/usr/local/lib/python3.6/dist-packages/fastai/core.py:302: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
  return np.array(a, dtype=dtype, **kwargs)
In [ ]:
import torch
import torch.nn as nn
from transformers import RobertaModel

# defining our model architecture 
class CustomRobertaModel(nn.Module):
    def __init__(self,num_labels=73):
        super(CustomRobertaModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(config.roberta_model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels) # defining final output layer
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _ , pooled_output = self.roberta(input_ids, token_type_ids, attention_mask)
        logits = self.classifier(pooled_output)        
        return logits
In [ ]:
CUDA_LAUNCH_BLOCKING=1
roberta_model = CustomRobertaModel(num_labels=config.num_labels)

learn = Learner(data, roberta_model, metrics=[accuracy])


In [ ]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)
epoch train_loss valid_loss accuracy time
0 2.420797 2.280798 0.525094 07:54
1 0.874999 0.838387 0.800054 07:56
2 0.459578 0.457809 0.882893 07:55
3 0.160908 0.354494 0.915542 07:56
4 0.156715 0.335501 0.923368 07:56
In [ ]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values
In [ ]:
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)
In [ ]:
# accuracy on valid
test_accuracy = (pred_values == data.valid_ds.y.items).mean()
print(test_accuracy)
0.9233675121424717
In [ ]:
def save_model(learner, file_name):
    st = learner.model.state_dict()
    torch.save(st, file_name) # will save model in current dir # backend is pickle 

def load_model(learner, file_name):
    st = torch.load(file_name)
    learner.model.load_state_dict(st)

# monkey patching Learner methods to save and load model file
Learner.save_model = save_model
Learner.load_model = load_model
In [ ]:
# From above report
update_model_score('RoBERTa with Over Sampling','-',test_accuracy,'-', '-','-')
Out[ ]:
Model Train_Acc Test_Acc Precision Recall F1_Score
0 RoBERTa with Over Sampling - 0.923368 - - -
In [ ]:
save_model_result()

**Final Model Selection**

In [ ]:
resultsDf = get_model_result_fromFile()
In [ ]:
resultsDf
Out[ ]:
New_ID Model Train_Acc Test_Acc Precision Recall F1_Score
0 1 ExtraTrees - Original Data 0.992870934 0.668646 0.658952685 0.711757269 0.684337867
1 2 SVM - Original Data 0.860686173 0.691211 0.67933036 0.741873805 0.709225897
2 3 Multinomial Naïve Bayes - Original Data 0.557700876 0.559976 0.594468028 0.85494107 0.701299768
3 4 Naïve Bayes - Original Data 0.934798752 0.571259 0.596109492 0.577777778 0.586800499
4 5 SGD Classifier - Original Data 0.703698203 0.631235 0.639680854 0.693411611 0.665463414
5 6 Decision Tree - Original Data 0.992870934 0.589667 0.595145411 0.595323741 0.595234563
6 7 Random Forest - Original Data 0.990197535 0.605107 0.653851539 0.634100809 0.643824735
7 8 AdaBoost - Original Data 0.513886826 0.523753 0.52093719 0.950431034 0.672999272
8 9 Bagging - Original Data 0.99257389 0.665677 0.658028094 0.684371184 0.670941162
9 10 Gradient Boosting - Original Data 0.982919947 0.630641 0.613589302 0.637072585 0.625110475
10 11 Bagging with Over-Sampling 0.990603288 0.982647 0.98414043 0.982647009 0.983393152
11 12 ExtraTrees with Over-Sampling 0.990899847 0.989019 0.990434274 0.98901881 0.989726036
12 13 LSTM with Word2Vec(Simple Averaging) 0.4640921 0.482779 - - -
13 14 LSTM with Word2Vec(TF-IDF Weighted Averaging) 0.464649099 0.482779 - - -
14 15 LSTM with Doc2Vec 0.466394365 0.482779 - - -
15 16 LSTM with FastText 0.527330112 0.564133 0.839545369 0.354599059 0.495131284
16 17 LSTM with Glove 0.626680285 0.634204 0.756910622 0.566981137 0.64677012
17 18 LSTM with Glove using SMOTE 0.950081855 0.983426 0.993631899 0.975291908 0.984238148
18 19 RNN with Glove with SMOTE 0.961369842 0.973035 0.990182221 0.963907957 0.976669252
19 20 GRU with Glove with SMOTE 0.953051579 0.982980 0.98321569 0.982692301 0.982949674
20 21 XLNet with Over-Sampling - 0.985765 0.99 0.99 0.98
21 22 BERT with Original Data - 0.690000 - - -
22 23 BERT with Over-Sampling for training - 0.900000 - - -
23 24 RoBERTa with Over Sampling - 0.923368 - - -

**Conclusion**:
As we can see ExtraTrees(98.9%)and XLNet(98.5%) are giving highest accuracy. XLnet is latest model and it is a very promising and potential one. In real word, XLNet is a generalized autoregressive pretraining method.

We have decided to go ahead with **XLNet Model**

**Model Performance on Production Data**

Lets perform this model on production (unseen) data.

We tried to test Model Performance on Production data to check how model is performing. We have used upsampling so by testing model with Production data we can check model performance

**Reload XLNet**

In [ ]:
!pip install transformers -q
% pip install sentencepiece
!pip install torch
In [ ]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
import torch
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F
In [ ]:
text = TextPreprocessor(n_jobs=-1).transform(incidentsData['New_Description'])
In [ ]:
# Get sentence data
sentences = text.to_list()
labels = incidentsData['Assignment group'].to_list()
In [ ]:
from sklearn import preprocessing
def labelencoder(dataframe) : 
  label_encoder = preprocessing.LabelEncoder() 
  dataframe= label_encoder.fit_transform(dataframe)
  grp_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
  return grp_mapping

tag2idx = labelencoder(incidentsData['Assignment group'])
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}
Load model
In [ ]:
xlnet_out_address = project_path + 'Embeddings/xlnet/models/xlnet_out_model/tc04'
In [ ]:
model = XLNetForSequenceClassification.from_pretrained(xlnet_out_address,num_labels=len(tag2idx))
In [ ]:
# Set model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
model.to(device);
In [ ]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)
Load Test Data
In [ ]:
inc_testdataLoader = IncidentDataloader(project_path+ "Test_data.xlsx")
testData = inc_testdataLoader.loadIncidents()
In [ ]:
print("Number of Null values in dataset: ",inc_testdataLoader.getNullRowCount(testData))
Number of Null values in dataset:  0
In [ ]:
testData = inc_dataLoader.removeDuplicateRows(testData)
testData = inc_dataLoader.combineDescription(testData)
In [ ]:
text = TextPreprocessor(n_jobs=-1).transform(testData['New_Description'])
# Get sentence data
sentences = text.to_list()
labels = testData['Assignment group'].to_list()
In [ ]:
names = testData['Assignment group'].unique().tolist()
grpID=[]
for grp in names:
  grpID.append(int(grp.replace("GRP_","")))

grp_mapping = dict(zip(grpID,names))

tag2idx = grp_mapping
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}
In [ ]:
tags = [tag2name[str(lab)] for lab in labels]
In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

from transformers import XLNetTokenizer
vocabulary =  project_path + 'Embeddings/xlnet/models/xlnet-base-cased-spiece.model'
max_len  = 100
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)
In [ ]:
max_len  = 100

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
In [ ]:
# Make label into id
tags = [tag2name[str(lab)] for lab in labels]

test_inputs = torch.tensor(full_input_ids)
test_tags = torch.tensor(tags)
test_masks = torch.tensor(full_input_masks)
test_segs = torch.tensor(full_segment_ids)

# Set batch num
batch_num = 32
test_data = TensorDataset(test_inputs, test_masks,test_segs, test_tags)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_num)
Eval model
In [ ]:
# Evalue loop
model.eval();
In [ ]:
# Set acc funtion
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)
    
In [ ]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(test_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch
    
    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]
    
    # Get textclassification predict result
    logits = logits.detach().cpu().numpy() 
    label_ids = b_labels.to('cpu').numpy()    
    tmp_eval_accuracy = accuracy(logits, label_ids)
    
    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)
        
    for real_result in label_ids.tolist():
        y_true.append(real_result)

    
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
   
    nb_eval_steps += 1
    
    
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(test_inputs)

result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy
                  }
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(xlnet_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))
        
    
    writer.write("\n\n")  
    writer.write(report)
***** Running evaluation *****
  Num examples =20
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.9
  eval_loss = 0.35486650466918945

**Conclusion**

As per above result, Test Accuracy on Production Data is 90% which is quite good. And with Test data peovided as part this assignment accuracy is 98.5%.

So, we can deploy this model in Production. As next steps, we can develop REST API using Flask RestFul for this model so that it can be integrated with another Web-app or Mobile-app.

In [ ]: